This is an machine learning uni project in the Universität Potsdam.

  • Tutor: Chenpo Hu
  • Date: 26.09.2018

1. Data Loading & Data Understanding

1.1 Load and add Header Row

  • Check Feature representaion as well: sparse etc.
In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cross_validation import StratifiedShuffleSplit
%matplotlib inline
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
#from jupyterthemes import jtplot

#jtplot.style('chesterish',grid=False)

# load data 
#df = pd.read_csv(r'C:\Users\chenp\Desktop\00_CS_Master_Kurse_SS2018\ML1_IDA\P6_V\caravan.train.csv')

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

Info = pd.read_csv(r'C:\Users\chenp\Desktop\00_CS_Master_Kurse_SS2018\ML1_IDA\P6_V\caravan.info.csv', sep='\t')

print Info

#add a header row to the training data (C1-C86); define the first 3/4 training data as our Train data;
Train = pd.read_csv(r'C:\Users\chenp\Desktop\00_CS_Master_Kurse_SS2018\ML1_IDA\P6_V\caravan.train.csv', 
                    sep='\t', names = ["C1MOSTYPE", "C2MAANTHUI", "C3MGEMOMV","C4MGEMLEEF", "C5MOSHOOFD", 
                                       "C6MGODRK","C7MGODPR", "C8MGODOV", "C9MGODGE","C10MRELGE", "C11MRELSA",
                                       "C12MRELOV","C13MFALLEEN", "C14MFGEKIND", "C15MFWEKIND","C16MOPLHOOG", "C17MOPLMIDD",
                                       "C18MOPLLAAG","C19MBERHOOG", "C20MBERZELF", "C21MBERBOER","C22MBERMIDD", 
                                       "C23MBERARBG", "C24MBERARBO","C25MSKA", "C26MSKB1", "C27MSKB2","C28MSKC", 
                                       "C29MSKD", "C30MHHUUR","C31MHKOOP", "C32MAUT1", "C33MAUT2","C34MAUT0", "C35MZFONDS",
                                       "C36MZPART","C37MINKMthirty", "C38MINK3045", "C39MINK4575","C40MINK7512", "C41MINK123M",
                                       "C42MINKGEM","C43MKOOPKLA", "C44PWAPART", "C45PWABEDR","C46PWALAND", "C47PPERSAUT", 
                                       "C48PBESAUT","C49PMOTSCO", "C50PVRAAUT", "C51PAANHANG","C52PTRACTOR", "C53PWERKT", 
                                       "C54PBROM","C55PLEVEN", "C56PPERSONG", "C57PGEZONG","C58PWAOREG", "C59PBRAND",
                                       "C60PZEILPL","C61PPLEZIER", "C62PFIETS", "C63PINBOED","C64PBYSTAND", "C65AWAPART", 
                                       "C66AWABEDR","C67AWALAND","C68APERSAUT", "C69ABESAUT", "C70AMOTSCO","C71AVRAAUT", 
                                       "C72AAANHANG", "C73ATRACTOR","C74AWERKT", "C75ABROM", "C76ALEVEN","C77APERSONG",
                                       "C78AGEZONG", "C79AWAOREG","C80ABRAND","C81AZEILPL", "C82APLEZIER", "C83AFIETS",
                                       "C84AINBOED", "C85ABYSTAND", "C86CARAVAN"])


#add a header row to the training data (C1-C85); define the last 1/4 training data as our Test data;
#Test = pd.read_csv(r'C:\Users\chenp\Desktop\00_CS_Master_Kurse_SS2018\ML1_IDA\P6_V\caravan.train.csv', sep='\t', names = ["C1_MOSTYPE", "C2_MAANTHUI", "C3_MGEMOMV","C4_MGEMLEEF", "C5_MOSHOOFD", "C6_MGODRK","C7_MGODPR", "C8_MGODOV", "C9_MGODGE","C10_MRELGE", "C11_MRELSA", "C12_MRELOV","C13_MFALLEEN", "C14_MFGEKIND", "C15_MFWEKIND","C16_MOPLHOOG", "C17_MOPLMIDD", "C18_MOPLLAAG","C19_MBERHOOG", "C20_MBERZELF", "C21_MBERBOER","C22_MBERMIDD", "C23_MBERARBG", "C24_MBERARBO","C25_MSKA", "C26_MSKB1", "C27_MSKB2","C28_MSKC", "C29_MSKD", "C30_MHHUUR","C31_MHKOOP", "C32_MAUT1", "C33_MAUT2","C34_ MAUT0", "C35_MZFONDS", "C36_MZPART","C37_MINKM30", "C38_MINK3045", "C39_MINK4575","C40_MINK7512", "C41_MINK123M", "C42_MINKGEM","C43_MKOOPKLA", "C44_PWAPART", "C45_PWABEDR","C46_PWALAND", "C47_PPERSAUT", "C48_PBESAUT","C49_PMOTSCO", "C50_PVRAAUT", "C51_PAANHANG","C52_PTRACTOR", "C53_PWERKT", "C54_PBROM","C55_PLEVEN", "C56_PPERSONG", "C57_PGEZONG","C58_PWAOREG", "C59_PBRAND", "C60_PZEILPL","C61_PPLEZIER", "C62_PFIETS", "C63_PINBOED","C64_PBYSTAND", "C65_AWAPART", "C66_AWABEDR","C67_AWALAND","C68_APERSAUT", "C69_ABESAUT", "C70_AMOTSCO","C71_AVRAAUT", "C72_AAANHANG", "C73_ATRACTOR","C74_AWERKT", "C75_ABROM", "C76_ALEVEN","C77_APERSONG", "C78_AGEZONG", "C79_AWAOREG","C80_ABRAND","C81_AZEILPL", "C82_APLEZIER", "C83_AFIETS","C84_AINBOED", "C85_ABYSTAND", "C86_CARAVAN"],skiprows=4366, nrows=1456)

#The orginal caravan.test.csv was renamed by me to caravan.output.csv
Output = pd.read_csv(r'C:\Users\chenp\Desktop\00_CS_Master_Kurse_SS2018\ML1_IDA\P6_V\caravan.output.csv', sep='\t', names = [
                                       "C1MOSTYPE", "C2MAANTHUI", "C3MGEMOMV","C4MGEMLEEF", "C5MOSHOOFD", 
                                       "C6MGODRK","C7MGODPR", "C8MGODOV", "C9MGODGE","C10MRELGE", "C11MRELSA",
                                       "C12MRELOV","C13MFALLEEN", "C14MFGEKIND", "C15MFWEKIND","C16MOPLHOOG", "C17MOPLMIDD",
                                       "C18MOPLLAAG","C19MBERHOOG", "C20MBERZELF", "C21MBERBOER","C22MBERMIDD", 
                                       "C23MBERARBG", "C24MBERARBO","C25MSKA", "C26MSKB1", "C27MSKB2","C28MSKC", 
                                       "C29MSKD", "C30MHHUUR","C31MHKOOP", "C32MAUT1", "C33MAUT2","C34MAUT0", "C35MZFONDS",
                                       "C36MZPART","C37MINKMthirty", "C38MINK3045", "C39MINK4575","C40MINK7512", "C41MINK123M",
                                       "C42MINKGEM","C43MKOOPKLA", "C44PWAPART", "C45PWABEDR","C46PWALAND", "C47PPERSAUT", 
                                       "C48PBESAUT","C49PMOTSCO", "C50PVRAAUT", "C51PAANHANG","C52PTRACTOR", "C53PWERKT", 
                                       "C54PBROM","C55PLEVEN", "C56PPERSONG", "C57PGEZONG","C58PWAOREG", "C59PBRAND",
                                       "C60PZEILPL","C61PPLEZIER", "C62PFIETS", "C63PINBOED","C64PBYSTAND", "C65AWAPART", 
                                       "C66AWABEDR","C67AWALAND","C68APERSAUT", "C69ABESAUT", "C70AMOTSCO","C71AVRAAUT", 
                                       "C72AAANHANG", "C73ATRACTOR","C74AWERKT", "C75ABROM", "C76ALEVEN","C77APERSONG",
                                       "C78AGEZONG", "C79AWAOREG","C80ABRAND","C81AZEILPL", "C82APLEZIER", "C83AFIETS",
                                       "C84AINBOED", "C85ABYSTAND", "C86CARAVAN"])

OutputdropM = Output[["C43MKOOPKLA", "C44PWAPART", "C45PWABEDR","C46PWALAND", "C47PPERSAUT", 
                                       "C48PBESAUT","C49PMOTSCO", "C50PVRAAUT", "C51PAANHANG","C52PTRACTOR", "C53PWERKT", 
                                       "C54PBROM","C55PLEVEN", "C56PPERSONG", "C57PGEZONG","C58PWAOREG", "C59PBRAND",
                                       "C60PZEILPL","C61PPLEZIER", "C62PFIETS", "C63PINBOED","C64PBYSTAND", "C65AWAPART", 
                                       "C66AWABEDR","C67AWALAND","C68APERSAUT", "C69ABESAUT", "C70AMOTSCO","C71AVRAAUT", 
                                       "C72AAANHANG", "C73ATRACTOR","C74AWERKT", "C75ABROM", "C76ALEVEN","C77APERSONG",
                                       "C78AGEZONG", "C79AWAOREG","C80ABRAND","C81AZEILPL", "C82APLEZIER", "C83AFIETS",
                                       "C84AINBOED", "C85ABYSTAND"]]
OutputselecdropM = Output[["C47PPERSAUT","C44PWAPART", "C61PPLEZIER", "C43MKOOPKLA","C59PBRAND","C64PBYSTAND","C58PWAOREG","C62PFIETS"]]

OutputdropM2 = Output[["C43MKOOPKLA", "C44PWAPART", "C45PWABEDR","C46PWALAND", "C47PPERSAUT", 
                                       "C48PBESAUT","C49PMOTSCO", "C50PVRAAUT", "C51PAANHANG","C52PTRACTOR", "C53PWERKT", 
                                       "C54PBROM","C55PLEVEN", "C56PPERSONG", "C57PGEZONG","C58PWAOREG", "C59PBRAND",
                                       "C60PZEILPL","C61PPLEZIER", "C62PFIETS", "C63PINBOED","C64PBYSTAND"]]

#view the 5 rows of the raw data in the training data and the testing data 
print "first 5 rows of Train are:\n", Train.head()
#print "first 5 rows of Test are:\n", Test.head()

#view the raw data of Train and Test
print "Train:\n" , Train
#print "Test:\n", Test
#print "TraindropM:\n", TraindropM
#scaler = preprocessing.StandardScaler().fit(Train)
c:\users\chenp\anaconda2\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
                            DATA DICTIONARY
0                Nr Name Description Domain
1         1 MOSTYPE Customer Subtype see L0
2        2 MAANTHUI Number of houses 1 � 10
3        3 MGEMOMV Avg size household 1 � 6
4                 4 MGEMLEEF Avg age see L1
5      5 MOSHOOFD Customer main type see L2
6            6 MGODRK Roman catholic see L3
7                   7 MGODPR Protestant ...
8                   8 MGODOV Other religion
9                      9 MGODGE No religion
10                        10 MRELGE Married
11                11 MRELSA Living together
12                 12 MRELOV Other relation
13                      13 MFALLEEN Singles
14   14 MFGEKIND Household without children
15      15 MFWEKIND Household with children
16         16 MOPLHOOG High level education
17       17 MOPLMIDD Medium level education
18        18 MOPLLAAG Lower level education
19                  19 MBERHOOG High status
20                 20 MBERZELF Entrepreneur
21                       21 MBERBOER Farmer
22            22 MBERMIDD Middle management
23            23 MBERARBG Skilled labourers
24          24 MBERARBO Unskilled labourers
25                   25 MSKA Social class A
26                 26 MSKB1 Social class B1
27                 27 MSKB2 Social class B2
28                   28 MSKC Social class C
29                   29 MSKD Social class D
..                                      ...
140                        3 Average Family
141                         4 Career Loners
142                           5 Living well
143                      6 Cruising Seniors
144                 7 Retired and Religeous
145                 8 Family with grown ups
146                 9 Conservative families
147                              10 Farmers
148                                     L3:
149                                    0 0%
150                               1 1 - 10%
151                              2 11 - 23%
152                              3 24 - 36%
153                              4 37 - 49%
154                              5 50 - 62%
155                              6 63 - 75%
156                              7 76 - 88%
157                              8 89 - 99%
158                                  9 100%
159                                     L4:
160                                   0 f 0
161                              1 f 1 � 49
162                             2 f 50 � 99
163                           3 f 100 � 199
164                           4 f 200 � 499
165                           5 f 500 � 999
166                         6 f 1000 � 4999
167                         7 f 5000 � 9999
168                     8 f 10.000 - 19.999
169                          9 f 20.000 - ?

[170 rows x 1 columns]
first 5 rows of Train are:
   C1MOSTYPE  C2MAANTHUI  C3MGEMOMV  C4MGEMLEEF  C5MOSHOOFD  C6MGODRK  \
0         33           1          3           2           8         0   
1         37           1          2           2           8         1   
2         37           1          2           2           8         0   
3          9           1          3           3           3         2   
4         40           1          4           2          10         1   

   C7MGODPR  C8MGODOV  C9MGODGE  C10MRELGE     ...      C77APERSONG  \
0         5         1         3          7     ...                0   
1         4         1         4          6     ...                0   
2         4         2         4          3     ...                0   
3         3         2         4          5     ...                0   
4         4         1         4          7     ...                0   

   C78AGEZONG  C79AWAOREG  C80ABRAND  C81AZEILPL  C82APLEZIER  C83AFIETS  \
0           0           0          1           0            0          0   
1           0           0          1           0            0          0   
2           0           0          1           0            0          0   
3           0           0          1           0            0          0   
4           0           0          1           0            0          0   

   C84AINBOED  C85ABYSTAND  C86CARAVAN  
0           0            0           0  
1           0            0           0  
2           0            0           0  
3           0            0           0  
4           0            0           0  

[5 rows x 86 columns]
Train:
      C1MOSTYPE  C2MAANTHUI  C3MGEMOMV  C4MGEMLEEF  C5MOSHOOFD  C6MGODRK  \
0            33           1          3           2           8         0   
1            37           1          2           2           8         1   
2            37           1          2           2           8         0   
3             9           1          3           3           3         2   
4            40           1          4           2          10         1   
5            23           1          2           1           5         0   
6            39           2          3           2           9         2   
7            33           1          2           3           8         0   
8            33           1          2           4           8         0   
9            11           2          3           3           3         3   
10           10           1          4           3           3         1   
11            9           1          3           3           3         1   
12           33           1          2           3           8         1   
13           41           1          3           3          10         0   
14           23           1          1           2           5         0   
15           33           1          2           3           8         0   
16           38           1          2           3           9         0   
17           22           2          3           3           5         0   
18           13           1          4           2           3         2   
19           31           1          2           4           7         0   
20           33           1          4           3           8         0   
21           33           2          3           3           8         0   
22           13           1          3           2           3         1   
23           34           2          3           2           8         0   
24           13           2          4           3           3         0   
25           33           1          3           3           8         0   
26           37           1          3           3           8         0   
27           40           1          3           3          10         0   
28           31           1          4           2           7         0   
29           33           2          2           3           8         0   
...         ...         ...        ...         ...         ...       ...   
5792         13           1          3           3           3         1   
5793         30           1          3           3           7         0   
5794         33           1          3           2           8         0   
5795         22           1          2           2           5         0   
5796         39           1          4           2           9         0   
5797          8           1          4           3           2         0   
5798         27           1          1           6           6         1   
5799          9           1          3           3           3         0   
5800         31           1          3           3           7         2   
5801          3           1          2           4           1         0   
5802          3           1          2           4           1         1   
5803         36           1          3           3           8         0   
5804         23           1          2           2           5         2   
5805         33           1          3           3           8         0   
5806         38           2          3           2           9         0   
5807         38           2          3           4           9         0   
5808         31           1          3           2           7         2   
5809         35           1          2           3           8         0   
5810         33           1          3           2           8         0   
5811         39           1          3           2           9         1   
5812         40           1          3           4          10         0   
5813          3           1          2           4           1         1   
5814          1           1          3           3           1         1   
5815         13           1          2           3           3         0   
5816          3           1          2           3           1         0   
5817         36           1          1           2           8         0   
5818         35           1          4           4           8         1   
5819         33           1          3           4           8         0   
5820         34           1          3           2           8         0   
5821         33           1          3           3           8         0   

      C7MGODPR  C8MGODOV  C9MGODGE  C10MRELGE     ...      C77APERSONG  \
0            5         1         3          7     ...                0   
1            4         1         4          6     ...                0   
2            4         2         4          3     ...                0   
3            3         2         4          5     ...                0   
4            4         1         4          7     ...                0   
5            5         0         5          0     ...                0   
6            2         0         5          7     ...                0   
7            7         0         2          7     ...                0   
8            1         3         6          6     ...                0   
9            5         0         2          7     ...                0   
10           4         1         4          7     ...                0   
11           3         2         4          7     ...                0   
12           4         1         4          6     ...                0   
13           5         0         4          7     ...                0   
14           6         1         2          1     ...                0   
15           7         0         2          7     ...                0   
16           6         0         3          7     ...                0   
17           5         0         4          7     ...                0   
18           4         0         3          7     ...                0   
19           2         0         7          9     ...                0   
20           6         0         3          9     ...                0   
21           4         2         3          7     ...                0   
22           7         0         2          7     ...                0   
23           7         0         2          7     ...                0   
24           4         2         4          8     ...                0   
25           6         1         2          6     ...                0   
26           5         0         4          7     ...                0   
27           3         0         6          9     ...                0   
28           9         0         0          5     ...                0   
29           7         1         2          5     ...                0   
...        ...       ...       ...        ...     ...              ...   
5792         5         1         3          7     ...                0   
5793         4         1         4          6     ...                0   
5794         2         4         4          8     ...                0   
5795         3         1         6          6     ...                0   
5796         5         1         4          7     ...                0   
5797         6         1         2          7     ...                0   
5798         5         2         3          5     ...                0   
5799         5         0         4          7     ...                0   
5800         3         0         5          4     ...                0   
5801         6         1         3          7     ...                0   
5802         4         1         4          8     ...                0   
5803         6         0         3          8     ...                0   
5804         4         1         3          3     ...                0   
5805         6         1         2          6     ...                0   
5806         6         1         2          7     ...                0   
5807         9         0         0          9     ...                0   
5808         2         0         5          6     ...                0   
5809         7         0         2          4     ...                0   
5810         4         0         5          9     ...                0   
5811         5         1         3          7     ...                0   
5812         7         0         2          6     ...                0   
5813         6         1         2          5     ...                0   
5814         4         2         3          7     ...                0   
5815         2         0         7          5     ...                0   
5816         6         0         3          6     ...                0   
5817         6         1         2          1     ...                0   
5818         4         1         4          6     ...                0   
5819         6         0         3          5     ...                0   
5820         7         0         2          7     ...                0   
5821         6         1         2          7     ...                0   

      C78AGEZONG  C79AWAOREG  C80ABRAND  C81AZEILPL  C82APLEZIER  C83AFIETS  \
0              0           0          1           0            0          0   
1              0           0          1           0            0          0   
2              0           0          1           0            0          0   
3              0           0          1           0            0          0   
4              0           0          1           0            0          0   
5              0           0          0           0            0          0   
6              0           0          0           0            0          0   
7              0           0          0           0            0          0   
8              0           0          0           0            0          0   
9              0           0          1           0            0          0   
10             0           0          0           0            0          0   
11             0           0          1           0            0          0   
12             0           0          0           0            0          0   
13             0           0          0           0            0          0   
14             0           0          1           0            0          0   
15             0           0          1           0            0          0   
16             0           0          0           0            0          0   
17             0           0          1           0            0          0   
18             0           0          1           0            0          0   
19             0           0          0           0            0          0   
20             0           0          0           0            0          0   
21             0           0          1           0            0          0   
22             0           0          1           0            0          0   
23             0           0          0           0            0          0   
24             0           0          0           0            0          1   
25             0           0          1           0            0          0   
26             0           0          0           0            0          0   
27             0           0          1           0            0          0   
28             0           0          0           0            0          0   
29             0           0          1           0            0          0   
...          ...         ...        ...         ...          ...        ...   
5792           0           0          1           0            0          0   
5793           0           0          0           0            0          0   
5794           0           0          1           0            0          0   
5795           0           0          0           0            0          0   
5796           0           0          0           0            0          0   
5797           0           0          1           0            0          0   
5798           0           0          1           0            0          0   
5799           0           0          1           0            0          0   
5800           0           0          1           0            0          0   
5801           0           0          1           0            0          0   
5802           0           0          0           0            0          0   
5803           0           0          0           0            0          0   
5804           0           0          1           0            0          0   
5805           0           0          0           0            0          0   
5806           0           0          1           0            0          0   
5807           0           0          1           0            0          0   
5808           0           0          0           0            0          0   
5809           0           0          0           0            0          0   
5810           0           0          0           0            0          0   
5811           0           0          1           0            0          0   
5812           0           0          2           0            0          0   
5813           0           0          1           0            0          0   
5814           0           0          1           0            0          0   
5815           0           0          1           0            0          0   
5816           0           0          0           0            0          0   
5817           0           0          1           0            0          0   
5818           0           0          1           0            0          0   
5819           0           0          1           0            0          0   
5820           0           0          0           0            0          0   
5821           0           0          0           0            0          0   

      C84AINBOED  C85ABYSTAND  C86CARAVAN  
0              0            0           0  
1              0            0           0  
2              0            0           0  
3              0            0           0  
4              0            0           0  
5              0            0           0  
6              0            0           0  
7              0            0           0  
8              0            0           0  
9              0            0           0  
10             0            0           0  
11             0            0           0  
12             0            0           0  
13             0            0           0  
14             0            0           0  
15             0            0           0  
16             0            0           0  
17             0            0           0  
18             0            0           0  
19             0            0           0  
20             0            0           0  
21             0            0           0  
22             0            0           0  
23             0            0           0  
24             0            0           0  
25             0            0           0  
26             0            0           0  
27             0            0           0  
28             0            0           0  
29             0            0           0  
...          ...          ...         ...  
5792           0            0           0  
5793           0            0           0  
5794           0            0           0  
5795           0            0           0  
5796           0            0           0  
5797           0            0           1  
5798           0            0           0  
5799           0            0           0  
5800           0            0           0  
5801           0            0           0  
5802           0            0           0  
5803           0            0           0  
5804           0            0           0  
5805           0            0           0  
5806           0            0           0  
5807           0            0           0  
5808           0            0           0  
5809           0            0           0  
5810           0            0           0  
5811           0            0           0  
5812           0            0           0  
5813           0            0           0  
5814           0            0           0  
5815           0            0           0  
5816           0            0           0  
5817           0            0           0  
5818           0            0           0  
5819           0            0           1  
5820           0            0           0  
5821           0            0           0  

[5822 rows x 86 columns]

1.2. Data Understanding: Data Exploration

1.2.1 Check Missing Values

If there is any missing values, then those missing values should be handled. But it turns out that there is no missing values.

In [3]:
#check if there is null value in the raw training data and testing data or not:
print "sum of Null value in Train:", Train.isnull().sum().sum()

Train.info()
Output.info()
sum of Null value in Train: 0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5822 entries, 0 to 5821
Data columns (total 86 columns):
C1MOSTYPE         5822 non-null int64
C2MAANTHUI        5822 non-null int64
C3MGEMOMV         5822 non-null int64
C4MGEMLEEF        5822 non-null int64
C5MOSHOOFD        5822 non-null int64
C6MGODRK          5822 non-null int64
C7MGODPR          5822 non-null int64
C8MGODOV          5822 non-null int64
C9MGODGE          5822 non-null int64
C10MRELGE         5822 non-null int64
C11MRELSA         5822 non-null int64
C12MRELOV         5822 non-null int64
C13MFALLEEN       5822 non-null int64
C14MFGEKIND       5822 non-null int64
C15MFWEKIND       5822 non-null int64
C16MOPLHOOG       5822 non-null int64
C17MOPLMIDD       5822 non-null int64
C18MOPLLAAG       5822 non-null int64
C19MBERHOOG       5822 non-null int64
C20MBERZELF       5822 non-null int64
C21MBERBOER       5822 non-null int64
C22MBERMIDD       5822 non-null int64
C23MBERARBG       5822 non-null int64
C24MBERARBO       5822 non-null int64
C25MSKA           5822 non-null int64
C26MSKB1          5822 non-null int64
C27MSKB2          5822 non-null int64
C28MSKC           5822 non-null int64
C29MSKD           5822 non-null int64
C30MHHUUR         5822 non-null int64
C31MHKOOP         5822 non-null int64
C32MAUT1          5822 non-null int64
C33MAUT2          5822 non-null int64
C34MAUT0          5822 non-null int64
C35MZFONDS        5822 non-null int64
C36MZPART         5822 non-null int64
C37MINKMthirty    5822 non-null int64
C38MINK3045       5822 non-null int64
C39MINK4575       5822 non-null int64
C40MINK7512       5822 non-null int64
C41MINK123M       5822 non-null int64
C42MINKGEM        5822 non-null int64
C43MKOOPKLA       5822 non-null int64
C44PWAPART        5822 non-null int64
C45PWABEDR        5822 non-null int64
C46PWALAND        5822 non-null int64
C47PPERSAUT       5822 non-null int64
C48PBESAUT        5822 non-null int64
C49PMOTSCO        5822 non-null int64
C50PVRAAUT        5822 non-null int64
C51PAANHANG       5822 non-null int64
C52PTRACTOR       5822 non-null int64
C53PWERKT         5822 non-null int64
C54PBROM          5822 non-null int64
C55PLEVEN         5822 non-null int64
C56PPERSONG       5822 non-null int64
C57PGEZONG        5822 non-null int64
C58PWAOREG        5822 non-null int64
C59PBRAND         5822 non-null int64
C60PZEILPL        5822 non-null int64
C61PPLEZIER       5822 non-null int64
C62PFIETS         5822 non-null int64
C63PINBOED        5822 non-null int64
C64PBYSTAND       5822 non-null int64
C65AWAPART        5822 non-null int64
C66AWABEDR        5822 non-null int64
C67AWALAND        5822 non-null int64
C68APERSAUT       5822 non-null int64
C69ABESAUT        5822 non-null int64
C70AMOTSCO        5822 non-null int64
C71AVRAAUT        5822 non-null int64
C72AAANHANG       5822 non-null int64
C73ATRACTOR       5822 non-null int64
C74AWERKT         5822 non-null int64
C75ABROM          5822 non-null int64
C76ALEVEN         5822 non-null int64
C77APERSONG       5822 non-null int64
C78AGEZONG        5822 non-null int64
C79AWAOREG        5822 non-null int64
C80ABRAND         5822 non-null int64
C81AZEILPL        5822 non-null int64
C82APLEZIER       5822 non-null int64
C83AFIETS         5822 non-null int64
C84AINBOED        5822 non-null int64
C85ABYSTAND       5822 non-null int64
C86CARAVAN        5822 non-null int64
dtypes: int64(86)
memory usage: 3.8 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 86 columns):
C1MOSTYPE         4000 non-null int64
C2MAANTHUI        4000 non-null int64
C3MGEMOMV         4000 non-null int64
C4MGEMLEEF        4000 non-null int64
C5MOSHOOFD        4000 non-null int64
C6MGODRK          4000 non-null int64
C7MGODPR          4000 non-null int64
C8MGODOV          4000 non-null int64
C9MGODGE          4000 non-null int64
C10MRELGE         4000 non-null int64
C11MRELSA         4000 non-null int64
C12MRELOV         4000 non-null int64
C13MFALLEEN       4000 non-null int64
C14MFGEKIND       4000 non-null int64
C15MFWEKIND       4000 non-null int64
C16MOPLHOOG       4000 non-null int64
C17MOPLMIDD       4000 non-null int64
C18MOPLLAAG       4000 non-null int64
C19MBERHOOG       4000 non-null int64
C20MBERZELF       4000 non-null int64
C21MBERBOER       4000 non-null int64
C22MBERMIDD       4000 non-null int64
C23MBERARBG       4000 non-null int64
C24MBERARBO       4000 non-null int64
C25MSKA           4000 non-null int64
C26MSKB1          4000 non-null int64
C27MSKB2          4000 non-null int64
C28MSKC           4000 non-null int64
C29MSKD           4000 non-null int64
C30MHHUUR         4000 non-null int64
C31MHKOOP         4000 non-null int64
C32MAUT1          4000 non-null int64
C33MAUT2          4000 non-null int64
C34MAUT0          4000 non-null int64
C35MZFONDS        4000 non-null int64
C36MZPART         4000 non-null int64
C37MINKMthirty    4000 non-null int64
C38MINK3045       4000 non-null int64
C39MINK4575       4000 non-null int64
C40MINK7512       4000 non-null int64
C41MINK123M       4000 non-null int64
C42MINKGEM        4000 non-null int64
C43MKOOPKLA       4000 non-null int64
C44PWAPART        4000 non-null int64
C45PWABEDR        4000 non-null int64
C46PWALAND        4000 non-null int64
C47PPERSAUT       4000 non-null int64
C48PBESAUT        4000 non-null int64
C49PMOTSCO        4000 non-null int64
C50PVRAAUT        4000 non-null int64
C51PAANHANG       4000 non-null int64
C52PTRACTOR       4000 non-null int64
C53PWERKT         4000 non-null int64
C54PBROM          4000 non-null int64
C55PLEVEN         4000 non-null int64
C56PPERSONG       4000 non-null int64
C57PGEZONG        4000 non-null int64
C58PWAOREG        4000 non-null int64
C59PBRAND         4000 non-null int64
C60PZEILPL        4000 non-null int64
C61PPLEZIER       4000 non-null int64
C62PFIETS         4000 non-null int64
C63PINBOED        4000 non-null int64
C64PBYSTAND       4000 non-null int64
C65AWAPART        4000 non-null int64
C66AWABEDR        4000 non-null int64
C67AWALAND        4000 non-null int64
C68APERSAUT       4000 non-null int64
C69ABESAUT        4000 non-null int64
C70AMOTSCO        4000 non-null int64
C71AVRAAUT        4000 non-null int64
C72AAANHANG       4000 non-null int64
C73ATRACTOR       4000 non-null int64
C74AWERKT         4000 non-null int64
C75ABROM          4000 non-null int64
C76ALEVEN         4000 non-null int64
C77APERSONG       4000 non-null int64
C78AGEZONG        4000 non-null int64
C79AWAOREG        4000 non-null int64
C80ABRAND         4000 non-null int64
C81AZEILPL        4000 non-null int64
C82APLEZIER       4000 non-null int64
C83AFIETS         4000 non-null int64
C84AINBOED        4000 non-null int64
C85ABYSTAND       4000 non-null int64
C86CARAVAN        0 non-null float64
dtypes: float64(1), int64(85)
memory usage: 2.6 MB

1.2.2 Check the data coding quality and data group internal correlation

  • Motivation: If the Data is clean and reasonable coded, then the corelation of Religion 6_9 should be negative to each other, the same or similar for Relationship10_13, Children14_15,Education16_18,Job19_24,SocialClass24_29,HauseOwnership30_31,CarNumber32_34,Income37_41. Since those values can normally not exist at the same time. Thouse fields should actually be
  • Result: the data is not normally coded. Since thouse data fields belongs to the sociodemographic variables, which belongs to sensitive data of the customers. So those data are normally coded using some codeing methods. After online research I knows that they are ZIP encoded. So I would say that thouse sociodemographic variables are not helpful for the models, they would be noises, unless I can corruppt the coding.
In [4]:
Religion6_9 = Train[['C6MGODRK','C7MGODPR', 'C8MGODOV', 'C9MGODGE']]

Relationship10_13 = Train[["C10MRELGE", "C11MRELSA", "C12MRELOV","C13MFALLEEN"]]

Children14_15 = Train[["C14MFGEKIND", "C15MFWEKIND"]]

Education16_18 = Train[["C16MOPLHOOG", "C17MOPLMIDD", "C18MOPLLAAG"]]

Job19_24 =  Train[["C19MBERHOOG", "C20MBERZELF", "C21MBERBOER","C22MBERMIDD", 
                                       "C23MBERARBG", "C24MBERARBO"]]

SocialClass24_29 = Train[["C24MBERARBO","C25MSKA", "C26MSKB1", "C27MSKB2","C28MSKC", 
                                       "C29MSKD"]]

HauseOwnership30_31 = Train[["C30MHHUUR","C31MHKOOP"]]

CarNumber32_34 =  Train[["C32MAUT1", "C33MAUT2","C34MAUT0"]]

Income37_41 = Train[["C37MINKMthirty", "C38MINK3045", "C39MINK4575","C40MINK7512", "C41MINK123M"]]

print Religion6_9 ,'\n', Relationship10_13, '\n', Children14_15,'\n', Education16_18, '\n',Job19_24,'\n',SocialClass24_29,'\n',HauseOwnership30_31,'\n',CarNumber32_34,'\n',Income37_41,'\n',Train[['C43MKOOPKLA']],'\n',Train[['C42MINKGEM']]

##Todo: Feature Representaition: Transform attributes, depending of model structure  For instance, linear model compute inner product of attributes and model parameters.  All attributes have to be numeric.  Larger attribute values: larger value of inner product  Categorial attributes, attributes without ordering, textual attributes have to be converted.
      C6MGODRK  C7MGODPR  C8MGODOV  C9MGODGE
0            0         5         1         3
1            1         4         1         4
2            0         4         2         4
3            2         3         2         4
4            1         4         1         4
5            0         5         0         5
6            2         2         0         5
7            0         7         0         2
8            0         1         3         6
9            3         5         0         2
10           1         4         1         4
11           1         3         2         4
12           1         4         1         4
13           0         5         0         4
14           0         6         1         2
15           0         7         0         2
16           0         6         0         3
17           0         5         0         4
18           2         4         0         3
19           0         2         0         7
20           0         6         0         3
21           0         4         2         3
22           1         7         0         2
23           0         7         0         2
24           0         4         2         4
25           0         6         1         2
26           0         5         0         4
27           0         3         0         6
28           0         9         0         0
29           0         7         1         2
...        ...       ...       ...       ...
5792         1         5         1         3
5793         0         4         1         4
5794         0         2         4         4
5795         0         3         1         6
5796         0         5         1         4
5797         0         6         1         2
5798         1         5         2         3
5799         0         5         0         4
5800         2         3         0         5
5801         0         6         1         3
5802         1         4         1         4
5803         0         6         0         3
5804         2         4         1         3
5805         0         6         1         2
5806         0         6         1         2
5807         0         9         0         0
5808         2         2         0         5
5809         0         7         0         2
5810         0         4         0         5
5811         1         5         1         3
5812         0         7         0         2
5813         1         6         1         2
5814         1         4         2         3
5815         0         2         0         7
5816         0         6         0         3
5817         0         6         1         2
5818         1         4         1         4
5819         0         6         0         3
5820         0         7         0         2
5821         0         6         1         2

[5822 rows x 4 columns] 
      C10MRELGE  C11MRELSA  C12MRELOV  C13MFALLEEN
0             7          0          2            1
1             6          2          2            0
2             3          2          4            4
3             5          2          2            2
4             7          1          2            2
5             0          6          3            3
6             7          2          0            0
7             7          2          0            0
8             6          0          3            3
9             7          0          2            2
10            7          1          2            0
11            7          1          2            2
12            6          2          3            3
13            7          1          1            1
14            1          2          6            5
15            7          2          0            0
16            7          0          2            0
17            7          0          2            0
18            7          0          2            1
19            9          0          0            0
20            9          0          0            0
21            7          0          2            0
22            7          0          2            1
23            7          2          0            0
24            8          1          1            1
25            6          0          3            2
26            7          2          0            0
27            9          0          0            0
28            5          0          4            0
29            5          1          4            4
...         ...        ...        ...          ...
5792          7          2          1            1
5793          6          1          3            1
5794          8          1          1            1
5795          6          2          1            1
5796          7          1          2            1
5797          7          1          1            0
5798          5          1          3            5
5799          7          0          2            2
5800          4          2          4            3
5801          7          0          2            3
5802          8          0          1            2
5803          8          0          1            0
5804          3          3          4            4
5805          6          0          3            2
5806          7          1          1            0
5807          9          0          0            0
5808          6          1          3            2
5809          4          0          5            5
5810          9          0          0            0
5811          7          1          2            2
5812          6          0          3            2
5813          5          1          3            2
5814          7          1          1            1
5815          5          2          3            3
5816          6          0          3            0
5817          1          2          6            5
5818          6          0          3            2
5819          5          1          4            3
5820          7          2          0            0
5821          7          1          2            1

[5822 rows x 4 columns] 
      C14MFGEKIND  C15MFWEKIND
0               2            6
1               4            5
2               4            2
3               3            4
4               4            4
5               5            2
6               3            6
7               5            4
8               3            3
9               2            6
10              3            6
11              3            5
12              4            3
13              4            5
14              3            1
15              5            4
16              6            3
17              2            7
18              3            6
19              6            3
20              3            6
21              2            7
22              3            6
23              4            5
24              3            6
25              3            5
26              3            6
27              4            5
28              0            9
29              1            5
...           ...          ...
5792            3            5
5793            4            5
5794            4            5
5795            4            4
5796            1            8
5797            3            6
5798            4            1
5799            4            3
5800            2            5
5801            2            5
5802            5            3
5803            3            6
5804            3            2
5805            3            5
5806            4            5
5807            4            5
5808            2            6
5809            1            3
5810            3            6
5811            3            5
5812            2            5
5813            4            3
5814            4            4
5815            3            5
5816            7            2
5817            3            2
5818            2            5
5819            3            4
5820            4            5
5821            4            4

[5822 rows x 2 columns] 
      C16MOPLHOOG  C17MOPLMIDD  C18MOPLLAAG
0               1            2            7
1               0            5            4
2               0            5            4
3               3            4            2
4               5            4            0
5               0            5            4
6               0            4            5
7               0            3            6
8               0            1            8
9               0            4            5
10              4            3            3
11              1            7            1
12              1            4            5
13              2            4            4
14              2            6            2
15              0            3            6
16              2            6            2
17              2            1            7
18              5            4            1
19              0            0            9
20              0            0            9
21              0            2            7
22              3            5            1
23              0            2            7
24              1            7            2
25              1            2            6
26              3            5            2
27              2            0            7
28              0            0            9
29              0            2            7
...           ...          ...          ...
5792            4            4            2
5793            1            3            5
5794            0            5            4
5795            2            2            5
5796            0            1            8
5797            5            3            3
5798            4            5            1
5799            0            7            2
5800            0            4            5
5801            1            6            3
5802            3            3            4
5803            2            5            3
5804            2            4            4
5805            1            2            6
5806            1            6            3
5807            1            0            8
5808            0            3            6
5809            1            2            7
5810            0            4            5
5811            1            5            3
5812            0            5            5
5813            1            4            4
5814            4            4            2
5815            2            4            4
5816            0            7            2
5817            2            5            2
5818            0            0            9
5819            0            1            8
5820            0            2            7
5821            1            2            6

[5822 rows x 3 columns] 
      C19MBERHOOG  C20MBERZELF  C21MBERBOER  C22MBERMIDD  C23MBERARBG  \
0               1            0            1            2            5   
1               0            0            0            5            0   
2               0            0            0            7            0   
3               4            0            0            3            1   
4               0            5            4            0            0   
5               2            0            0            4            2   
6               0            0            0            4            1   
7               2            0            0            2            5   
8               1            1            0            1            8   
9               2            0            0            3            3   
10              0            0            0            9            0   
11              4            0            0            5            1   
12              1            1            0            3            2   
13              3            0            1            2            2   
14              1            0            0            4            3   
15              2            0            0            2            5   
16              2            0            0            4            0   
17              0            2            0            1            1   
18              6            0            0            3            0   
19              0            0            0            2            4   
20              0            0            3            0            6   
21              0            0            2            4            0   
22              6            0            0            2            0   
23              0            2            0            2            4   
24              4            0            0            3            3   
25              1            0            1            4            2   
26              1            0            0            5            2   
27              2            0            5            0            2   
28              0            0            2            0            3   
29              0            0            1            2            6   
...           ...          ...          ...          ...          ...   
5792            3            1            1            4            2   
5793            3            0            1            4            1   
5794            0            0            0            4            1   
5795            2            0            0            3            2   
5796            1            0            0            2            4   
5797            0            0            0            4            3   
5798            5            0            0            4            0   
5799            0            0            0            7            0   
5800            0            0            0            2            2   
5801            3            1            1            2            2   
5802            5            1            0            2            1   
5803            3            0            2            3            2   
5804            1            0            0            2            3   
5805            1            0            1            4            2   
5806            0            0            0            1            4   
5807            0            0            1            2            3   
5808            1            0            2            0            4   
5809            0            0            0            2            3   
5810            0            0            0            2            5   
5811            3            0            3            1            2   
5812            3            0            5            1            0   
5813            2            1            0            4            1   
5814            6            0            0            2            1   
5815            2            0            0            2            2   
5816            0            0            0            5            2   
5817            2            0            0            4            1   
5818            2            1            1            3            3   
5819            1            0            0            2            3   
5820            0            2            0            2            4   
5821            1            0            1            3            2   

      C24MBERARBO  
0               2  
1               4  
2               2  
3               2  
4               0  
5               2  
6               5  
7               2  
8               1  
9               3  
10              0  
11              1  
12              4  
13              2  
14              2  
15              2  
16              4  
17              5  
18              1  
19              4  
20              0  
21              3  
22              1  
23              2  
24              0  
25              4  
26              2  
27              0  
28              5  
29              1  
...           ...  
5792            1  
5793            2  
5794            5  
5795            3  
5796            4  
5797            3  
5798            0  
5799            2  
5800            5  
5801            1  
5802            1  
5803            1  
5804            4  
5805            4  
5806            5  
5807            4  
5808            4  
5809            5  
5810            3  
5811            2  
5812            1  
5813            3  
5814            1  
5815            4  
5816            3  
5817            3  
5818            2  
5819            5  
5820            2  
5821            4  

[5822 rows x 6 columns] 
      C24MBERARBO  C25MSKA  C26MSKB1  C27MSKB2  C28MSKC  C29MSKD
0               2        1         1         2        6        1
1               4        0         2         3        5        0
2               2        0         5         0        4        0
3               2        3         2         1        4        0
4               0        9         0         0        0        0
5               2        2         2         2        4        2
6               5        0         1         4        5        0
7               2        2         1         2        5        2
8               1        1         1         0        8        1
9               3        1         2         1        4        2
10              0        3         0         6        0        0
11              1        2         3         4        1        0
12              4        1         2         2        5        1
13              2        4         2         1        4        0
14              2        1         3         2        4        0
15              2        2         1         2        5        2
16              4        2         2         4        2        0
17              5        2         0         0        7        0
18              1        5         2         1        2        0
19              4        0         0         0        7        2
20              0        0         0         3        6        0
21              3        0         0         5        4        0
22              1        6         1         3        1        0
23              2        0         0         4        5        0
24              0        1         3         3        3        0
25              4        1         2         2        5        2
26              2        1         3         3        2        1
27              0        2         1         5        3        0
28              5        0         4         0        5        0
29              1        0         2         1        7        1
...           ...      ...       ...       ...      ...      ...
5792            1        3         3         2        2        1
5793            2        1         2         4        3        1
5794            5        0         3         2        4        0
5795            3        2         1         2        5        1
5796            4        1         1         1        5        4
5797            3        4         0         3        3        0
5798            0        5         4         1        0        1
5799            2        0         3         5        2        0
5800            5        0         2         3        3        2
5801            1        1         1         4        4        0
5802            1        3         2         2        3        1
5803            1        3         1         3        3        1
5804            4        1         2         3        5        1
5805            4        1         2         2        5        2
5806            5        1         0         1        8        0
5807            4        1         0         1        6        2
5808            4        0         0         5        4        1
5809            5        0         0         3        6        0
5810            3        0         2         2        5        2
5811            2        3         2         3        3        1
5812            1        3         1         5        1        0
5813            3        2         2         3        4        1
5814            1        4         2         3        2        0
5815            4        2         2         1        4        2
5816            3        0         2         3        5        0
5817            3        2         3         3        3        0
5818            2        0         4         5        0        0
5819            5        1         1         1        4        4
5820            2        0         0         4        5        0
5821            4        1         1         2        6        1

[5822 rows x 6 columns] 
      C30MHHUUR  C31MHKOOP
0             1          8
1             2          7
2             7          2
3             5          4
4             4          5
5             9          0
6             6          3
7             0          9
8             9          0
9             0          9
10            0          9
11            6          3
12            5          4
13            5          4
14            9          0
15            0          9
16            6          3
17            4          5
18            4          5
19            9          0
20            9          0
21            0          9
22            4          5
23            2          7
24            1          8
25            2          7
26            8          1
27            2          7
28            9          0
29            8          1
...         ...        ...
5792          1          8
5793          2          7
5794          5          4
5795          8          1
5796          8          1
5797          3          6
5798          8          1
5799          9          0
5800          9          0
5801          3          6
5802          3          6
5803          3          6
5804          8          1
5805          2          7
5806          5          4
5807          6          3
5808          7          2
5809          3          6
5810          0          9
5811          3          6
5812          0          9
5813          3          6
5814          0          9
5815          9          0
5816          0          9
5817          9          0
5818          3          6
5819          7          2
5820          2          7
5821          5          4

[5822 rows x 2 columns] 
      C32MAUT1  C33MAUT2  C34MAUT0
0            8         0         1
1            7         1         2
2            7         0         2
3            9         0         0
4            6         2         1
5            5         3         3
6            8         0         1
7            4         4         2
8            5         2         3
9            6         1         2
10           6         2         1
11           7         1         2
12           6         1         3
13           7         2         0
14           5         1         3
15           4         4         2
16           7         2         0
17           6         1         2
18           7         1         2
19           7         2         0
20           7         2         0
21           7         0         2
22           7         2         1
23           5         4         0
24           8         0         1
25           7         0         2
26           9         0         0
27           7         2         0
28           3         2         4
29           5         1         4
...        ...       ...       ...
5792         6         3         1
5793         4         0         5
5794         8         1         0
5795         7         1         2
5796         6         1         2
5797         8         1         1
5798         6         1         3
5799         9         0         0
5800         3         0         6
5801         6         2         2
5802         6         3         1
5803         7         2         0
5804         6         0         3
5805         7         0         2
5806         8         1         1
5807         5         0         4
5808         6         0         3
5809         5         0         4
5810         9         0         0
5811         5         3         1
5812         3         3         3
5813         6         1         3
5814         6         3         0
5815         6         0         3
5816         5         2         2
5817         5         1         3
5818         6         1         2
5819         4         0         5
5820         5         4         0
5821         5         2         3

[5822 rows x 3 columns] 
      C37MINKMthirty  C38MINK3045  C39MINK4575  C40MINK7512  C41MINK123M
0                  0            4            5            0            0
1                  2            0            5            2            0
2                  4            5            0            0            0
3                  1            5            3            0            0
4                  0            0            9            0            0
5                  5            2            3            0            0
6                  4            3            3            0            0
7                  2            5            3            0            0
8                  7            2            1            0            0
9                  2            3            3            1            0
10                 0            3            2            2            2
11                 3            4            3            1            0
12                 3            4            3            1            0
13                 3            2            4            1            0
14                 4            3            3            0            0
15                 2            5            3            0            0
16                 0            1            6            2            0
17                 0            6            3            0            0
18                 2            2            6            0            0
19                 5            4            0            0            0
20                 3            5            2            0            0
21                 3            6            0            0            0
22                 1            1            8            0            0
23                 0            5            4            0            0
24                 0            5            4            0            0
25                 0            4            5            0            0
26                 2            3            3            2            0
27                 4            2            4            0            0
28                 9            0            0            0            0
29                 5            3            2            0            0
...              ...          ...          ...          ...          ...
5792               2            4            4            1            0
5793               2            2            2            4            1
5794               8            1            1            0            0
5795               2            6            2            0            0
5796               2            6            2            0            0
5797               0            1            5            4            0
5798               5            4            0            0            0
5799               4            5            0            0            0
5800               7            2            0            0            0
5801               5            2            2            2            1
5802               2            1            4            2            2
5803               2            4            3            0            0
5804               6            3            1            0            1
5805               0            4            5            0            0
5806               0            4            5            1            0
5807               2            6            1            1            0
5808               7            2            0            0            0
5809               5            2            3            0            0
5810               1            5            4            0            0
5811               3            3            3            1            0
5812               2            5            3            0            0
5813               3            4            3            1            0
5814               1            3            4            2            0
5815               3            4            3            0            0
5816               0            4            4            2            0
5817               4            3            3            0            0
5818               0            9            0            0            0
5819               5            3            1            1            0
5820               0            5            4            0            0
5821               2            5            2            1            0

[5822 rows x 5 columns] 
      C43MKOOPKLA
0               3
1               4
2               4
3               4
4               3
5               3
6               5
7               3
8               3
9               7
10              7
11              4
12              3
13              4
14              3
15              3
16              4
17              2
18              6
19              1
20              3
21              3
22              6
23              6
24              6
25              3
26              4
27              3
28              1
29              3
...           ...
5792            6
5793            2
5794            3
5795            2
5796            5
5797            7
5798            1
5799            4
5800            1
5801            6
5802            6
5803            3
5804            3
5805            3
5806            4
5807            4
5808            1
5809            5
5810            3
5811            5
5812            3
5813            6
5814            8
5815            6
5816            6
5817            3
5818            5
5819            3
5820            6
5821            3

[5822 rows x 1 columns] 
      C42MINKGEM
0              4
1              5
2              3
3              4
4              6
5              3
6              3
7              3
8              2
9              4
10             8
11             3
12             3
13             4
14             3
15             3
16             5
17             4
18             4
19             3
20             3
21             3
22             5
23             4
24             4
25             4
26             5
27             3
28             1
29             3
...          ...
5792           4
5793           6
5794           2
5795           3
5796           3
5797           7
5798           3
5799           3
5800           2
5801           4
5802           7
5803           4
5804           2
5805           4
5806           4
5807           4
5808           2
5809           2
5810           4
5811           4
5812           3
5813           4
5814           5
5815           4
5816           5
5817           3
5818           4
5819           3
5820           4
5821           3

[5822 rows x 1 columns]

1.2.3 Data/Feature correlation

1.2.3.1 Data/Feature correlation of all data sets

This is also double check for 1.2.2: If the data of the sociodemographic variables are properly coded in the raw data, then the table should display lots of nagative -1 values for the sociodemographic variables.

In [5]:
Train.corr(method='pearson').style.format("{:.2}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)
Out[5]:
C1MOSTYPE C2MAANTHUI C3MGEMOMV C4MGEMLEEF C5MOSHOOFD C6MGODRK C7MGODPR C8MGODOV C9MGODGE C10MRELGE C11MRELSA C12MRELOV C13MFALLEEN C14MFGEKIND C15MFWEKIND C16MOPLHOOG C17MOPLMIDD C18MOPLLAAG C19MBERHOOG C20MBERZELF C21MBERBOER C22MBERMIDD C23MBERARBG C24MBERARBO C25MSKA C26MSKB1 C27MSKB2 C28MSKC C29MSKD C30MHHUUR C31MHKOOP C32MAUT1 C33MAUT2 C34MAUT0 C35MZFONDS C36MZPART C37MINKMthirty C38MINK3045 C39MINK4575 C40MINK7512 C41MINK123M C42MINKGEM C43MKOOPKLA C44PWAPART C45PWABEDR C46PWALAND C47PPERSAUT C48PBESAUT C49PMOTSCO C50PVRAAUT C51PAANHANG C52PTRACTOR C53PWERKT C54PBROM C55PLEVEN C56PPERSONG C57PGEZONG C58PWAOREG C59PBRAND C60PZEILPL C61PPLEZIER C62PFIETS C63PINBOED C64PBYSTAND C65AWAPART C66AWABEDR C67AWALAND C68APERSAUT C69ABESAUT C70AMOTSCO C71AVRAAUT C72AAANHANG C73ATRACTOR C74AWERKT C75ABROM C76ALEVEN C77APERSONG C78AGEZONG C79AWAOREG C80ABRAND C81AZEILPL C82APLEZIER C83AFIETS C84AINBOED C85ABYSTAND C86CARAVAN
C1MOSTYPE 1.0 -0.039 -0.022 0.0095 0.99 -0.19 0.09 -0.026 -0.02 -0.066 -0.016 0.087 0.029 -0.066 0.023 -0.47 -0.28 0.53 -0.42 -0.12 0.25 -0.21 0.3 0.27 -0.39 -0.19 -0.038 0.39 0.2 0.15 -0.15 -0.18 0.038 0.16 0.35 -0.35 0.2 0.16 -0.26 -0.19 -0.14 -0.3 -0.57 -0.04 -0.0099 0.076 -0.0085 -0.028 -0.0036 0.018 0.032 0.097 0.032 0.025 -0.043 0.028 -0.015 0.0014 -0.0098 0.014 -0.018 -0.012 -0.017 -0.059 -0.033 -0.0041 0.08 -0.0083 -0.028 -0.013 0.019 0.03 0.092 0.02 0.032 -0.029 0.022 -0.012 -0.0029 -0.012 0.0078 -0.018 -0.016 -0.021 -0.054 -0.069
C2MAANTHUI -0.039 1.0 0.01 0.057 -0.046 -0.0061 -0.024 0.012 0.021 0.023 -0.039 -0.013 0.033 -0.083 0.042 0.0034 -0.053 0.037 -0.0057 0.042 -0.018 0.0058 0.025 -0.052 -0.014 0.0099 -0.025 -0.0073 0.021 -0.049 0.048 -0.011 0.024 -0.011 -0.016 0.015 0.015 -0.024 0.015 0.0019 -0.029 -0.0094 -0.024 0.041 -0.0032 -0.028 -0.026 -0.02 -0.016 -0.011 -0.011 -0.031 -0.0081 -0.014 0.019 0.0084 0.011 -0.017 -0.0014 -0.0054 0.0044 -0.017 0.0081 -0.0038 0.043 0.0047 -0.028 -0.029 -0.019 -0.019 -0.0097 -0.01 -0.028 -0.0068 -0.013 0.021 0.0033 0.0094 -0.016 0.00065 -0.0062 0.00067 -0.021 0.018 -0.0042 -0.0098
C3MGEMOMV -0.022 0.01 1.0 -0.33 0.016 0.013 0.049 -0.11 -0.0055 0.53 -0.18 -0.5 -0.66 -0.32 0.79 0.0081 0.04 -0.039 0.024 0.036 0.11 -0.0049 0.038 -0.16 0.063 0.012 0.059 -0.019 -0.19 -0.34 0.34 0.25 0.2 -0.4 -0.073 0.074 -0.37 0.044 0.26 0.1 0.088 0.28 0.32 -0.042 0.027 0.034 0.021 -0.0032 0.022 -0.0003 0.012 0.059 0.032 0.023 0.026 0.0038 0.016 0.017 0.056 0.008 0.0019 0.031 0.013 0.035 -0.044 0.035 0.036 0.021 -0.009 0.013 0.00061 0.015 0.059 0.027 0.019 0.017 -0.00013 0.017 0.019 -0.0093 0.0092 0.00064 0.03 0.026 0.028 0.036
C4MGEMLEEF 0.0095 0.057 -0.33 1.0 0.0039 -0.038 0.094 0.058 -0.12 -0.04 -0.31 0.2 0.24 0.2 -0.36 -0.026 -0.22 0.18 0.14 0.058 0.091 -0.16 -0.048 0.034 0.048 -0.16 -0.04 -0.064 0.24 0.025 -0.023 -0.13 -0.089 0.2 -0.12 0.12 0.18 -0.054 -0.15 0.028 0.018 -0.076 -0.15 -0.018 -0.0046 0.0079 -0.015 0.019 -0.017 -0.0099 0.013 -0.00045 0.0034 -0.0039 -0.029 0.02 -0.0068 -0.0016 0.019 0.00021 -0.0047 0.019 -0.014 -0.019 -0.014 -0.0082 0.0075 -0.016 0.022 -0.016 -0.013 0.0061 -0.0055 0.0056 0.00047 -0.025 0.021 -0.0017 0.00064 0.024 0.00024 -0.0018 0.021 -0.02 -0.015 0.0045
C5MOSHOOFD 0.99 -0.046 0.016 0.0039 1.0 -0.2 0.098 -0.035 -0.021 -0.027 -0.038 0.053 -0.0034 -0.075 0.059 -0.47 -0.28 0.52 -0.41 -0.12 0.28 -0.22 0.31 0.26 -0.38 -0.19 -0.03 0.38 0.19 0.12 -0.12 -0.17 0.063 0.13 0.34 -0.34 0.18 0.16 -0.24 -0.18 -0.13 -0.28 -0.54 -0.049 -0.0063 0.086 -0.0057 -0.027 -0.0035 0.021 0.036 0.11 0.037 0.03 -0.04 0.031 -0.013 0.0014 -0.00032 0.015 -0.021 -0.015 -0.017 -0.057 -0.042 -0.00069 0.09 -0.0056 -0.029 -0.012 0.022 0.034 0.11 0.022 0.036 -0.027 0.024 -0.01 -0.0022 -0.012 0.0071 -0.021 -0.018 -0.021 -0.052 -0.069
C6MGODRK -0.19 -0.0061 0.013 -0.038 -0.2 1.0 -0.37 0.021 -0.064 -0.029 0.12 -0.0035 0.016 -0.01 -0.0075 0.24 0.15 -0.25 0.22 0.057 -0.11 0.088 -0.18 -0.072 0.17 0.14 0.08 -0.22 -0.075 -0.096 0.096 -0.02 0.037 0.0049 -0.22 0.22 -0.093 -0.11 0.11 0.19 0.22 0.18 0.15 0.053 -0.019 -0.054 -0.00025 0.0056 0.013 -0.0058 -0.0096 -0.049 -0.0052 -0.02 0.0014 -0.008 0.008 -0.014 -0.0039 0.0099 0.013 0.0018 -0.002 0.00045 0.046 -0.027 -0.055 -0.0027 0.0073 0.0094 -0.011 -0.0039 -0.044 -0.0042 -0.024 0.0038 -0.0084 0.0054 -0.013 0.0019 -0.00068 0.012 -0.0015 -0.011 -0.004 0.0062
C7MGODPR 0.09 -0.024 0.049 0.094 0.098 -0.37 1.0 -0.32 -0.74 0.15 -0.21 -0.084 -0.12 0.066 0.024 -0.094 -0.016 0.056 -0.011 0.043 0.11 -0.032 0.017 -0.035 -0.011 -0.041 0.029 0.058 -0.07 -0.22 0.22 -0.018 0.11 -0.082 -0.042 0.046 -0.063 0.059 0.037 -0.061 -0.095 -0.011 0.017 -0.014 0.024 0.037 0.019 0.0073 -0.0044 0.027 0.02 0.044 0.011 0.014 -0.012 -0.02 0.00063 0.0064 0.077 0.0089 0.018 0.0056 0.0014 0.019 -0.014 0.025 0.04 0.028 0.0098 -0.003 0.025 0.016 0.043 0.0076 0.016 -0.018 -0.014 0.0039 0.004 0.046 0.014 0.018 0.0019 0.0024 0.017 0.033
C8MGODOV -0.026 0.012 -0.11 0.058 -0.035 0.021 -0.32 1.0 -0.14 -0.13 0.11 0.11 0.12 0.023 -0.11 0.0024 0.013 -0.006 -0.052 0.0063 -0.047 0.078 -0.066 0.11 -0.033 0.054 -0.026 0.0017 0.068 0.15 -0.15 -0.0091 -0.1 0.11 0.11 -0.12 0.0087 0.029 -0.0085 -0.015 -0.048 -0.039 -0.068 0.019 -0.0034 -0.019 -0.023 0.009 -0.0014 -0.0068 -0.016 -0.033 0.0012 -0.031 -0.019 -0.00048 -0.0046 0.006 -0.02 -0.021 0.0051 0.027 -0.012 0.0098 0.017 -0.01 -0.017 -0.025 0.0062 0.0017 -0.0051 -0.016 -0.029 0.00066 -0.025 0.0031 0.0066 -0.0056 0.018 0.0063 -0.024 0.0094 0.026 -0.0097 0.01 0.004
C9MGODGE -0.02 0.021 -0.0055 -0.12 -0.021 -0.064 -0.74 -0.14 1.0 -0.11 0.16 0.072 0.065 -0.087 0.032 -0.0031 -0.037 0.043 -0.057 -0.067 -0.052 -0.024 0.083 0.024 -0.053 -0.023 -0.042 0.031 0.083 0.22 -0.23 0.013 -0.063 0.055 0.1 -0.1 0.11 -0.026 -0.081 0.0012 0.049 -0.052 -0.057 -0.021 -0.014 -0.014 -0.018 -0.015 -0.0026 -0.023 -0.0079 -0.013 -0.012 0.011 0.012 0.017 0.0061 -0.0013 -0.074 -0.011 -0.024 -0.018 0.006 -0.032 -0.018 -0.009 -0.017 -0.024 -0.019 -0.0037 -0.018 -0.005 -0.014 -0.0081 0.0085 0.01 0.0088 0.0042 -0.0041 -0.053 -0.0084 -0.026 -0.011 0.0073 -0.027 -0.042
C10MRELGE -0.066 0.023 0.53 -0.04 -0.027 -0.029 0.15 -0.13 -0.11 1.0 -0.48 -0.88 -0.68 0.075 0.5 0.075 0.027 -0.082 0.18 0.088 0.02 -0.0072 0.027 -0.3 0.18 -0.029 0.017 -0.036 -0.24 -0.39 0.39 0.42 0.23 -0.61 -0.23 0.23 -0.44 0.086 0.25 0.18 0.064 0.35 0.33 -0.039 0.015 0.008 0.026 -0.0091 0.021 0.0073 0.015 0.016 0.015 0.014 0.032 0.017 0.021 0.013 0.058 0.013 -0.011 0.018 -0.0087 0.036 -0.043 0.018 0.01 0.028 -0.011 0.013 0.0095 0.015 0.021 0.015 0.015 0.015 0.012 0.022 0.012 -0.008 0.0097 -0.0016 0.021 -0.0034 0.037 0.07
C11MRELSA -0.016 -0.039 -0.18 -0.31 -0.038 0.12 -0.21 0.11 0.16 -0.48 1.0 0.085 0.098 0.17 -0.21 -0.015 0.14 -0.079 -0.093 -0.011 -0.09 0.15 -0.026 0.056 -0.092 0.17 0.077 -0.011 -0.048 0.16 -0.16 -0.093 0.031 0.11 0.15 -0.15 0.067 0.02 -0.038 0.018 0.031 -0.05 -0.062 0.013 -0.0058 -0.0019 0.0033 -0.01 0.016 0.0061 -0.011 -0.0083 -0.024 0.0019 0.0022 -0.026 -0.0034 -0.01 -0.039 -0.018 0.021 -0.036 0.019 -0.011 0.014 -0.0026 -0.0013 -0.00025 -0.0081 0.019 0.0099 -0.0078 -0.006 -0.026 0.0018 0.0037 -0.021 -0.0079 -0.011 -0.00047 -0.021 0.02 -0.037 0.018 -0.014 -0.033
C12MRELOV 0.087 -0.013 -0.5 0.2 0.053 -0.0035 -0.084 0.11 0.072 -0.88 0.085 1.0 0.75 -0.19 -0.45 -0.082 -0.086 0.13 -0.15 -0.074 0.039 -0.056 -0.017 0.31 -0.17 -0.023 -0.046 0.037 0.31 0.37 -0.37 -0.43 -0.28 0.66 0.18 -0.19 0.47 -0.11 -0.27 -0.18 -0.07 -0.37 -0.34 0.036 -0.0098 -0.0082 -0.033 0.018 -0.026 -0.0061 -0.0086 -0.0098 -0.0057 -0.019 -0.034 -0.0096 -0.024 -0.0063 -0.042 -0.0079 0.0026 -0.0049 0.0037 -0.041 0.041 -0.017 -0.01 -0.033 0.02 -0.018 -0.011 -0.0097 -0.014 -0.006 -0.019 -0.016 -0.0069 -0.024 -0.0037 0.013 -0.0038 -0.0063 -0.0079 -0.0026 -0.04 -0.062
C13MFALLEEN 0.029 0.033 -0.66 0.24 -0.0034 0.016 -0.12 0.12 0.065 -0.68 0.098 0.75 1.0 -0.21 -0.63 -0.0049 -0.036 0.045 -0.06 -0.058 -0.012 -0.0021 -0.035 0.16 -0.096 0.055 -0.026 -0.054 0.26 0.35 -0.35 -0.33 -0.24 0.52 0.12 -0.12 0.48 -0.12 -0.27 -0.17 -0.078 -0.36 -0.3 0.044 -0.015 -0.029 -0.045 0.012 -0.022 0.0012 -0.01 -0.032 -0.0069 -0.034 -0.029 -0.0023 -0.034 -0.0065 -0.052 -0.0097 -0.012 -0.0088 -0.0027 -0.037 0.047 -0.022 -0.031 -0.044 0.018 -0.011 -0.0023 -0.01 -0.034 -0.0046 -0.031 -0.014 0.0019 -0.033 -0.0049 0.012 -0.011 -0.019 -0.01 -0.014 -0.034 -0.053
C14MFGEKIND -0.066 -0.083 -0.32 0.2 -0.075 -0.01 0.066 0.023 -0.087 0.075 0.17 -0.19 -0.21 1.0 -0.59 0.05 0.11 -0.13 0.13 0.11 0.0022 0.027 -0.089 -0.1 0.14 0.017 0.014 -0.062 -0.13 -0.048 0.053 0.11 -0.012 -0.11 -0.083 0.088 -0.1 0.036 0.074 0.039 0.011 0.11 -0.027 -0.014 0.00097 0.0074 0.00075 -0.0021 0.0019 0.0075 0.017 -0.0058 -0.003 0.008 -0.0026 -0.0048 0.0085 -0.0021 0.012 0.0045 0.015 -0.006 -0.011 -0.00021 -0.018 -0.00064 0.01 0.0019 0.00077 -0.00049 0.014 0.014 -6.4e-05 0.0015 0.012 -0.02 -0.0046 0.0095 -0.0058 0.0099 0.011 0.022 -0.0033 -0.01 0.0017 0.008
C15MFWEKIND 0.023 0.042 0.79 -0.36 0.059 -0.0075 0.024 -0.11 0.032 0.5 -0.21 -0.45 -0.63 -0.59 1.0 -0.036 -0.049 0.063 -0.044 -0.035 0.028 -0.033 0.11 -0.059 -0.015 -0.058 -0.0038 0.095 -0.098 -0.24 0.24 0.18 0.21 -0.34 -0.034 0.032 -0.3 0.064 0.17 0.099 0.052 0.21 0.27 -0.034 0.014 0.019 0.033 -0.0086 0.016 -0.0075 -0.0047 0.036 0.0083 0.02 0.026 0.0061 0.016 0.0061 0.033 0.0089 -0.012 0.0081 0.0091 0.023 -0.033 0.019 0.019 0.029 -0.017 0.013 -0.0094 -0.002 0.031 0.005 0.017 0.025 0.00082 0.014 0.0098 -0.021 0.0041 -0.01 0.0083 0.017 0.02 0.032
C16MOPLHOOG -0.47 0.0034 0.0081 -0.026 -0.47 0.24 -0.094 0.0024 -0.0031 0.075 -0.015 -0.082 -0.0049 0.05 -0.036 1.0 0.0075 -0.64 0.56 0.27 -0.13 0.041 -0.34 -0.31 0.69 0.18 -0.019 -0.53 -0.26 -0.23 0.23 0.14 0.01 -0.16 -0.52 0.52 -0.26 -0.29 0.35 0.31 0.29 0.43 0.4 0.049 -0.0042 -0.061 -0.00082 -0.01 -0.02 -0.0097 -0.034 -0.065 -0.036 -0.035 0.028 -0.016 0.0093 -0.012 0.027 0.011 0.0079 0.035 0.022 0.037 0.044 0.005 -0.064 0.0065 -0.016 -0.017 -0.01 -0.033 -0.058 -0.03 -0.041 0.014 -0.012 0.0046 -0.01 0.016 0.017 0.0076 0.04 0.016 0.04 0.085
C17MOPLMIDD -0.28 -0.053 0.04 -0.22 -0.28 0.15 -0.016 0.013 -0.037 0.027 0.14 -0.086 -0.036 0.11 -0.049 0.0075 1.0 -0.75 0.16 0.037 -0.059 0.39 -0.31 -0.24 0.18 0.39 0.28 -0.36 -0.37 -0.14 0.14 0.11 0.024 -0.14 -0.22 0.23 -0.18 -0.00074 0.16 0.095 0.048 0.18 0.23 0.014 0.027 -0.02 0.0077 0.0079 0.0088 0.0098 -0.0018 -0.013 -0.005 -0.032 0.043 -0.0066 0.0029 0.00023 -0.0048 -0.0084 0.019 0.0083 0.021 0.036 0.0051 0.017 -0.019 0.01 0.0094 0.0077 0.015 -0.0013 -0.0012 -0.0076 -0.031 0.03 -0.0039 0.00079 0.0044 -0.0093 -0.0045 0.02 0.01 0.018 0.031 0.044
C18MOPLLAAG 0.53 0.037 -0.039 0.18 0.52 -0.25 0.056 -0.006 0.043 -0.082 -0.079 0.13 0.045 -0.13 0.063 -0.64 -0.75 1.0 -0.49 -0.2 0.12 -0.31 0.46 0.4 -0.59 -0.41 -0.2 0.63 0.46 0.27 -0.26 -0.18 -0.029 0.23 0.51 -0.51 0.31 0.2 -0.36 -0.27 -0.21 -0.42 -0.45 -0.044 -0.011 0.053 -0.0036 -0.00083 0.007 -0.0017 0.024 0.049 0.024 0.045 -0.05 0.015 -0.011 0.0073 -0.017 0.002 -0.019 -0.032 -0.03 -0.055 -0.035 -0.0074 0.054 -0.0094 0.0029 0.0053 -0.0053 0.023 0.035 0.025 0.047 -0.03 0.011 -0.0063 0.0044 -0.004 -0.0057 -0.019 -0.035 -0.023 -0.053 -0.091
C19MBERHOOG -0.42 -0.0057 0.024 0.14 -0.41 0.22 -0.011 -0.052 -0.057 0.18 -0.093 -0.15 -0.06 0.13 -0.044 0.56 0.16 -0.49 1.0 0.063 -0.1 -0.17 -0.39 -0.38 0.69 0.051 0.072 -0.54 -0.21 -0.33 0.34 0.13 0.14 -0.23 -0.61 0.61 -0.27 -0.22 0.33 0.27 0.22 0.36 0.39 0.037 0.0043 -0.015 0.0072 0.0015 0.0067 0.003 -0.0082 -0.01 -0.02 -0.036 0.034 -0.00028 0.0071 0.008 0.062 -0.001 0.012 0.033 0.022 0.021 0.029 -0.00069 -0.015 0.0099 -0.00044 0.0063 0.0082 -0.0086 -0.0061 -0.021 -0.039 0.016 0.0082 0.0071 0.0035 0.041 0.014 0.0043 0.036 0.015 0.022 0.065
C20MBERZELF -0.12 0.042 0.036 0.058 -0.12 0.057 0.043 0.0063 -0.067 0.088 -0.011 -0.074 -0.058 0.11 -0.035 0.27 0.037 -0.2 0.063 1.0 0.08 -0.12 -0.096 -0.2 0.36 -0.044 -0.029 -0.17 -0.16 -0.11 0.11 -0.025 0.16 -0.085 -0.21 0.21 -0.12 -0.13 0.21 0.093 0.11 0.19 0.082 0.0049 0.0087 0.0036 -0.013 0.0092 -0.031 -0.009 -0.013 -0.00055 -0.013 -0.014 -0.0092 -0.0062 -0.013 0.01 0.044 0.031 0.024 0.015 0.0063 0.0053 0.0011 0.0029 0.0019 -0.0042 0.0046 -0.022 -0.011 -0.012 0.0074 -0.013 -0.017 -0.014 -0.01 -0.014 0.012 0.021 0.027 0.019 0.024 0.011 0.011 0.022
C21MBERBOER 0.25 -0.018 0.11 0.091 0.28 -0.11 0.11 -0.047 -0.052 0.02 -0.09 0.039 -0.012 0.0022 0.028 -0.13 -0.059 0.12 -0.1 0.08 1.0 -0.28 -0.068 -0.067 0.099 -0.11 0.19 -0.14 0.024 -0.19 0.19 -0.16 0.17 0.034 -0.047 0.046 0.038 -0.037 0.032 -0.03 -0.043 -0.04 -0.11 -0.07 0.04 0.17 -0.04 0.0016 -0.0068 0.031 0.077 0.21 0.067 0.05 -0.0058 0.03 0.0055 0.027 0.087 0.035 -0.0056 -0.036 -0.02 -0.027 -0.069 0.038 0.18 -0.031 -0.0011 -0.0049 0.032 0.066 0.2 0.05 0.045 -0.0043 0.017 0.0064 0.038 0.016 0.017 -0.0065 -0.029 -0.02 -0.029 -0.054
C22MBERMIDD -0.21 0.0058 -0.0049 -0.16 -0.22 0.088 -0.032 0.078 -0.024 -0.0072 0.15 -0.056 -0.0021 0.027 -0.033 0.041 0.39 -0.31 -0.17 -0.12 -0.28 1.0 -0.35 -0.28 -0.17 0.52 0.16 -0.15 -0.22 0.018 -0.018 0.18 -0.12 -0.091 -0.002 0.0045 -0.14 0.13 0.022 0.0092 0.026 0.11 0.14 0.024 -0.0037 -0.047 0.011 0.0052 0.018 -0.0032 -0.0095 -0.064 -0.015 -0.03 0.009 -0.015 -0.0019 -0.002 -0.012 -0.012 -0.0085 0.0005 0.0074 0.037 0.022 -0.0065 -0.047 0.014 0.0051 0.014 -0.001 -0.0034 -0.058 -0.0033 -0.028 0.0059 -0.011 -0.0025 -0.0015 -0.0008 -0.015 0.0075 -0.0019 0.012 0.034 0.048
C23MBERARBG 0.3 0.025 0.038 -0.048 0.31 -0.18 0.017 -0.066 0.083 0.027 -0.026 -0.017 -0.035 -0.089 0.11 -0.34 -0.31 0.46 -0.39 -0.096 -0.068 -0.35 1.0 -0.11 -0.36 -0.29 -0.21 0.61 0.089 0.13 -0.13 -0.017 -0.047 0.066 0.39 -0.4 0.18 0.11 -0.19 -0.14 -0.12 -0.24 -0.22 -0.0077 0.0065 -0.023 -0.0028 -0.015 0.0033 -0.0073 -0.021 -0.023 -0.0038 0.019 -0.012 0.011 0.0043 -0.0088 -0.036 -0.00023 -0.0026 -0.0065 -0.0034 -0.048 -0.003 0.015 -0.024 -0.011 -0.013 0.00063 -0.011 -0.017 -0.029 -0.0031 0.021 0.0042 0.0098 0.0045 -0.0076 -0.021 0.0015 -0.0021 -0.01 -0.0012 -0.048 -0.042
C24MBERARBO 0.27 -0.052 -0.16 0.034 0.26 -0.072 -0.035 0.11 0.024 -0.3 0.056 0.31 0.16 -0.1 -0.059 -0.31 -0.24 0.4 -0.38 -0.2 -0.067 -0.28 -0.11 1.0 -0.38 -0.19 -0.072 0.26 0.45 0.34 -0.34 -0.24 -0.13 0.35 0.34 -0.34 0.27 0.06 -0.25 -0.13 -0.12 -0.3 -0.35 -0.017 -0.027 0.0037 -0.0027 0.012 -0.01 0.0025 0.015 -0.0078 0.01 0.028 -0.024 -0.007 -0.013 -0.013 -0.059 -0.015 -0.0055 -0.035 -0.021 -0.0059 -0.0094 -0.028 0.0023 -0.012 0.015 -0.0036 -0.0016 0.01 -0.011 0.0098 0.032 -0.015 -0.0091 -0.013 -0.016 -0.02 -0.018 -0.013 -0.037 -0.021 -0.0046 -0.055
C25MSKA -0.39 -0.014 0.063 0.048 -0.38 0.17 -0.011 -0.033 -0.053 0.18 -0.092 -0.17 -0.096 0.14 -0.015 0.69 0.18 -0.59 0.69 0.36 0.099 -0.17 -0.36 -0.38 1.0 -0.059 -0.096 -0.56 -0.25 -0.32 0.32 0.12 0.17 -0.25 -0.58 0.58 -0.29 -0.27 0.42 0.24 0.25 0.4 0.37 0.011 0.0097 2.9e-05 0.0047 -0.0051 -0.017 0.0081 -0.0083 0.0059 -0.01 -0.022 0.044 0.0087 0.00091 -0.0016 0.056 0.011 0.018 0.024 0.025 0.019 0.0032 0.015 -0.0017 0.012 -0.0091 -0.0097 0.011 -0.011 0.019 -0.016 -0.03 0.023 0.0093 -0.00073 0.00031 0.021 0.027 0.011 0.031 0.018 0.023 0.063
C26MSKB1 -0.19 0.0099 0.012 -0.16 -0.19 0.14 -0.041 0.054 -0.023 -0.029 0.17 -0.023 0.055 0.017 -0.058 0.18 0.39 -0.41 0.051 -0.044 -0.11 0.52 -0.29 -0.19 -0.059 1.0 -0.096 -0.32 -0.27 -0.071 0.069 0.096 -0.049 -0.052 -0.14 0.14 -0.14 0.0032 0.11 0.099 0.049 0.14 0.17 0.029 -0.0048 -0.0049 -0.014 0.0072 0.0077 0.0082 -0.0012 -0.0047 0.023 -0.03 0.005 -0.0065 0.011 0.014 0.0054 0.0088 0.0025 0.012 0.0024 0.01 0.024 -0.015 -0.0044 -0.014 0.0059 0.012 0.0084 0.0028 -0.0059 0.023 -0.028 -0.0023 -0.0032 0.011 0.013 0.0032 0.001 0.0075 0.0066 -0.0013 0.01 0.029
C27MSKB2 -0.038 -0.025 0.059 -0.04 -0.03 0.08 0.029 -0.026 -0.042 0.017 0.077 -0.046 -0.026 0.014 -0.0038 -0.019 0.28 -0.2 0.072 -0.029 0.19 0.16 -0.21 -0.072 -0.096 -0.096 1.0 -0.4 -0.21 -0.16 0.16 0.021 0.062 -0.074 -0.084 0.085 -0.05 0.016 0.018 0.053 0.051 0.11 0.06 -0.0025 0.026 0.029 -0.0037 0.011 0.0079 -0.00098 0.024 0.053 0.01 0.011 0.016 0.0026 0.0017 0.02 0.025 0.01 -0.0088 -0.025 -0.014 0.035 -0.0053 0.03 0.032 0.0056 0.0057 0.0018 0.00066 0.023 0.045 0.011 0.011 0.018 0.0057 -0.00097 0.017 0.0042 -0.008 -0.0042 -0.019 -0.017 0.025 0.004
C28MSKC 0.39 -0.0073 -0.019 -0.064 0.38 -0.22 0.058 0.0017 0.031 -0.036 -0.011 0.037 -0.054 -0.062 0.095 -0.53 -0.36 0.63 -0.54 -0.17 -0.14 -0.15 0.61 0.26 -0.56 -0.32 -0.4 1.0 0.016 0.28 -0.28 -0.047 -0.1 0.12 0.5 -0.5 0.22 0.19 -0.26 -0.21 -0.21 -0.33 -0.33 -0.03 -0.011 -0.0049 0.017 -0.024 -0.0057 -0.00026 -0.0052 -0.031 -0.011 0.027 -0.041 -0.0016 -0.0007 -0.02 -0.046 -0.014 -0.013 -0.016 -0.007 -0.036 -0.023 -0.011 -0.005 0.0039 -0.016 -0.0083 -0.0026 -0.0031 -0.033 -0.0067 0.031 -0.024 -0.0031 0.0013 -0.017 -0.019 -0.013 -0.0082 -0.023 -0.00088 -0.033 -0.042
C29MSKD 0.2 0.021 -0.19 0.24 0.19 -0.075 -0.07 0.068 0.083 -0.24 -0.048 0.31 0.26 -0.13 -0.098 -0.26 -0.37 0.46 -0.21 -0.16 0.024 -0.22 0.089 0.45 -0.25 -0.27 -0.21 0.016 1.0 0.3 -0.3 -0.23 -0.14 0.38 0.26 -0.26 0.31 0.034 -0.28 -0.14 -0.069 -0.32 -0.32 0.0012 -0.026 -0.022 -0.032 0.026 0.0069 -0.0031 -0.0057 -0.026 -0.021 0.01 -0.021 0.014 -0.021 -0.024 -0.057 -0.016 0.007 -0.019 -0.0097 -0.042 0.0094 -0.027 -0.022 -0.032 0.026 0.0086 -0.006 -0.011 -0.037 -0.015 0.012 -0.0098 0.0071 -0.021 -0.024 -0.011 -0.019 0.001 -0.02 -0.01 -0.04 -0.063
C30MHHUUR 0.15 -0.049 -0.34 0.025 0.12 -0.096 -0.22 0.15 0.22 -0.39 0.16 0.37 0.35 -0.048 -0.24 -0.23 -0.14 0.27 -0.33 -0.11 -0.19 0.018 0.13 0.34 -0.32 -0.071 -0.16 0.28 0.3 1.0 -1.0 -0.19 -0.26 0.39 0.37 -0.37 0.5 0.013 -0.35 -0.24 -0.17 -0.43 -0.43 -0.016 -0.033 -0.054 -0.0086 -0.01 -0.027 -0.016 -0.029 -0.062 -0.027 -0.018 -0.038 0.0019 -0.02 -0.03 -0.18 -0.021 -0.022 -0.027 -0.0083 -0.042 -0.0072 -0.035 -0.054 -0.021 -0.0092 -0.022 -0.014 -0.026 -0.059 -0.015 -0.012 -0.026 0.0051 -0.022 -0.028 -0.082 -0.024 -0.026 -0.03 -0.012 -0.037 -0.079
C31MHKOOP -0.15 0.048 0.34 -0.023 -0.12 0.096 0.22 -0.15 -0.23 0.39 -0.16 -0.37 -0.35 0.053 0.24 0.23 0.14 -0.26 0.34 0.11 0.19 -0.018 -0.13 -0.34 0.32 0.069 0.16 -0.28 -0.3 -1.0 1.0 0.19 0.25 -0.4 -0.37 0.37 -0.49 -0.012 0.35 0.24 0.17 0.42 0.43 0.016 0.032 0.054 0.0083 0.01 0.028 0.016 0.028 0.062 0.027 0.018 0.04 -0.0021 0.022 0.03 0.18 0.021 0.022 0.027 0.0081 0.042 0.0071 0.035 0.054 0.021 0.0089 0.022 0.014 0.025 0.06 0.015 0.012 0.026 -0.0053 0.024 0.028 0.082 0.024 0.026 0.03 0.012 0.038 0.078
C32MAUT1 -0.18 -0.011 0.25 -0.13 -0.17 -0.02 -0.018 -0.0091 0.013 0.42 -0.093 -0.43 -0.33 0.11 0.18 0.14 0.11 -0.18 0.13 -0.025 -0.16 0.18 -0.017 -0.24 0.12 0.096 0.021 -0.047 -0.23 -0.19 0.19 1.0 -0.39 -0.73 -0.1 0.11 -0.32 0.096 0.22 0.057 -0.083 0.23 0.27 0.024 0.006 -0.0086 0.047 -0.0099 0.014 0.0008 -0.015 -0.026 -0.013 -0.012 0.031 -0.0096 0.043 -0.0084 0.02 -0.0056 -0.019 0.027 0.032 0.03 0.016 0.011 -0.0084 0.047 -0.019 0.011 -0.00092 -0.013 -0.022 -0.016 -0.009 0.034 -0.013 0.046 -0.0058 -0.0087 0.0092 -0.01 0.029 0.034 0.033 0.07
C33MAUT2 0.038 0.024 0.2 -0.089 0.063 0.037 0.11 -0.1 -0.063 0.23 0.031 -0.28 -0.24 -0.012 0.21 0.01 0.024 -0.029 0.14 0.16 0.17 -0.12 -0.047 -0.13 0.17 -0.049 0.062 -0.1 -0.14 -0.26 0.25 -0.39 1.0 -0.28 -0.23 0.23 -0.19 0.021 0.074 0.12 0.16 0.18 0.16 -0.036 0.027 0.039 -0.00018 0.0014 0.0072 0.014 0.035 0.065 0.014 0.031 0.0086 0.0073 -0.013 0.028 0.069 0.034 0.01 -0.0096 -0.00055 0.0092 -0.039 0.026 0.039 0.0044 0.0052 0.004 0.018 0.034 0.063 0.017 0.028 -0.0068 0.0063 -0.016 0.023 0.021 0.019 0.0034 -0.0092 0.0054 0.0021 0.0078
C34MAUT0 0.16 -0.011 -0.4 0.2 0.13 0.0049 -0.082 0.11 0.055 -0.61 0.11 0.66 0.52 -0.11 -0.34 -0.16 -0.14 0.23 -0.23 -0.085 0.034 -0.091 0.066 0.35 -0.25 -0.052 -0.074 0.12 0.38 0.39 -0.4 -0.73 -0.28 1.0 0.29 -0.3 0.47 -0.093 -0.3 -0.14 -0.042 -0.38 -0.4 -0.0035 -0.019 -0.029 -0.053 0.013 -0.015 -0.012 -0.0086 -0.034 0.0029 -0.004 -0.041 0.0042 -0.034 -0.0067 -0.078 -0.024 0.0092 -0.019 -0.027 -0.044 0.0063 -0.021 -0.029 -0.057 0.022 -0.0081 -0.013 -0.0094 -0.038 0.0082 -0.0058 -0.031 0.0048 -0.035 -0.0068 -0.0089 -0.028 0.0032 -0.024 -0.033 -0.044 -0.077
C35MZFONDS 0.35 -0.016 -0.073 -0.12 0.34 -0.22 -0.042 0.11 0.1 -0.23 0.15 0.18 0.12 -0.083 -0.034 -0.52 -0.22 0.51 -0.61 -0.21 -0.047 -0.002 0.39 0.34 -0.58 -0.14 -0.084 0.5 0.26 0.37 -0.37 -0.1 -0.23 0.29 1.0 -1.0 0.3 0.24 -0.34 -0.32 -0.24 -0.42 -0.39 -0.02 -0.0066 -0.013 -0.01 0.0013 0.013 -0.004 -0.011 -0.033 0.0019 0.015 -0.045 -0.017 -0.0053 -0.0053 -0.072 -0.039 -0.014 -0.021 -0.015 -0.036 -0.014 -0.007 -0.013 -0.019 0.0021 0.017 -0.005 -0.0057 -0.033 0.0028 0.016 -0.029 -0.02 -0.0027 0.0017 -0.026 -0.041 -0.01 -0.024 -0.0055 -0.035 -0.058
C36MZPART -0.35 0.015 0.074 0.12 -0.34 0.22 0.046 -0.12 -0.1 0.23 -0.15 -0.19 -0.12 0.088 0.032 0.52 0.23 -0.51 0.61 0.21 0.046 0.0045 -0.4 -0.34 0.58 0.14 0.085 -0.5 -0.26 -0.37 0.37 0.11 0.23 -0.3 -1.0 1.0 -0.3 -0.23 0.34 0.31 0.24 0.42 0.39 0.019 0.0063 0.014 0.009 -0.0016 -0.013 0.0039 0.012 0.034 -0.002 -0.015 0.045 0.017 0.0059 0.0051 0.071 0.039 0.013 0.022 0.016 0.036 0.012 0.0067 0.014 0.018 -0.0023 -0.016 0.0049 0.0061 0.035 -0.003 -0.016 0.029 0.02 0.0036 -0.0019 0.026 0.041 0.01 0.025 0.0062 0.035 0.058
C37MINKMthirty 0.2 0.015 -0.37 0.18 0.18 -0.093 -0.063 0.0087 0.11 -0.44 0.067 0.47 0.48 -0.1 -0.3 -0.26 -0.18 0.31 -0.27 -0.12 0.038 -0.14 0.18 0.27 -0.29 -0.14 -0.05 0.22 0.31 0.5 -0.49 -0.32 -0.19 0.47 0.3 -0.3 1.0 -0.32 -0.55 -0.27 -0.11 -0.65 -0.4 -0.0043 0.0039 -0.02 -0.042 0.025 -0.00025 -0.013 -0.012 -0.0062 0.02 0.0093 -0.031 0.00043 -0.022 0.0073 -0.083 0.004 -0.012 -0.023 -0.0093 -0.052 0.0045 -0.0014 -0.02 -0.037 0.027 0.0064 -0.018 -0.016 -0.011 0.019 0.0099 -0.014 -0.0054 -0.019 0.0069 -0.025 -0.0099 -0.012 -0.025 -0.018 -0.055 -0.08
C38MINK3045 0.16 -0.024 0.044 -0.054 0.16 -0.11 0.059 0.029 -0.026 0.086 0.02 -0.11 -0.12 0.036 0.064 -0.29 -0.00074 0.2 -0.22 -0.13 -0.037 0.13 0.11 0.06 -0.27 0.0032 0.016 0.19 0.034 0.013 -0.012 0.096 0.021 -0.093 0.24 -0.23 -0.32 1.0 -0.42 -0.27 -0.21 -0.23 -0.12 -0.0068 -0.00048 0.022 0.025 -0.018 0.025 0.02 0.011 0.00016 -0.015 0.0058 -0.0009 0.021 -0.022 0.022 -0.0042 -0.0098 0.0013 -0.0045 -0.0088 -0.0023 -0.012 -0.0062 0.021 0.017 -0.02 0.017 0.023 0.014 -0.0042 -0.0068 0.011 -0.0078 0.021 -0.017 0.023 0.0042 0.0016 0.0047 -0.0079 -0.0057 0.00038 -0.00098
C39MINK4575 -0.26 0.015 0.26 -0.15 -0.24 0.11 0.037 -0.0085 -0.081 0.25 -0.038 -0.27 -0.27 0.074 0.17 0.35 0.16 -0.36 0.33 0.21 0.032 0.022 -0.19 -0.25 0.42 0.11 0.018 -0.26 -0.28 -0.35 0.35 0.22 0.074 -0.3 -0.34 0.34 -0.55 -0.42 1.0 0.036 0.046 0.52 0.36 -0.0019 -0.0075 0.0041 0.027 -0.0044 -0.018 0.0039 -0.011 0.017 -0.0022 -0.012 0.032 -0.012 0.04 -0.022 0.064 -0.012 0.014 0.02 0.016 0.036 -0.0059 0.0021 0.0058 0.026 -0.0032 -0.015 0.0035 -0.0081 0.029 -0.0096 -0.014 0.024 -0.00082 0.036 -0.02 0.0083 -0.00076 0.011 0.026 0.023 0.037 0.058
C40MINK7512 -0.19 0.0019 0.1 0.028 -0.18 0.19 -0.061 -0.015 0.0012 0.18 0.018 -0.18 -0.17 0.039 0.099 0.31 0.095 -0.27 0.27 0.093 -0.03 0.0092 -0.14 -0.13 0.24 0.099 0.053 -0.21 -0.14 -0.24 0.24 0.057 0.12 -0.14 -0.32 0.31 -0.27 -0.27 0.036 1.0 0.21 0.62 0.27 0.026 0.001 -0.0035 -0.0077 -0.0091 0.00069 -0.0029 0.024 -0.015 -0.02 -0.022 0.0066 -0.0097 0.029 -0.013 0.038 0.027 -0.0041 0.0037 0.0047 0.029 0.026 0.0039 -0.0037 -0.0087 -0.013 0.0024 -0.00082 0.021 -0.016 -0.015 -0.024 0.00076 -0.014 0.023 -0.014 0.022 0.01 -0.0034 0.0047 0.00062 0.028 0.058
C41MINK123M -0.14 -0.029 0.088 0.018 -0.13 0.22 -0.095 -0.048 0.049 0.064 0.031 -0.07 -0.078 0.011 0.052 0.29 0.048 -0.21 0.22 0.11 -0.043 0.026 -0.12 -0.12 0.25 0.049 0.051 -0.21 -0.069 -0.17 0.17 -0.083 0.16 -0.042 -0.24 0.24 -0.11 -0.21 0.046 0.21 1.0 0.3 0.17 0.01 0.0032 -0.017 -0.029 -0.0052 -0.0021 -0.014 -0.004 -0.032 -0.021 -0.0018 0.0035 -0.0048 -0.0098 -0.0081 0.03 -0.0073 0.0076 0.0024 0.019 0.018 0.01 0.0036 -0.014 -0.023 -0.0057 -0.006 -0.013 -0.002 -0.028 -0.018 -0.0048 0.0005 -0.0098 -0.01 -0.0059 0.023 -0.0083 0.0073 0.00074 0.02 0.013 -0.002
C42MINKGEM -0.3 -0.0094 0.28 -0.076 -0.28 0.18 -0.011 -0.039 -0.052 0.35 -0.05 -0.37 -0.36 0.11 0.21 0.43 0.18 -0.42 0.36 0.19 -0.04 0.11 -0.24 -0.3 0.4 0.14 0.11 -0.33 -0.32 -0.43 0.42 0.23 0.18 -0.38 -0.42 0.42 -0.65 -0.23 0.52 0.62 0.3 1.0 0.45 0.02 0.011 0.0062 0.024 -0.015 -0.0022 -0.0081 0.016 -0.0047 -0.016 -0.012 0.032 -0.014 0.028 -0.013 0.08 0.012 0.0099 0.025 0.014 0.05 0.015 0.017 0.0072 0.027 -0.017 -0.0065 -0.0046 0.015 0.0023 -0.015 -0.015 0.018 -0.013 0.023 -0.014 0.021 0.0095 0.012 0.033 0.016 0.049 0.09
C43MKOOPKLA -0.57 -0.024 0.32 -0.15 -0.54 0.15 0.017 -0.068 -0.057 0.33 -0.062 -0.34 -0.3 -0.027 0.27 0.4 0.23 -0.45 0.39 0.082 -0.11 0.14 -0.22 -0.35 0.37 0.17 0.06 -0.33 -0.32 -0.43 0.43 0.27 0.16 -0.4 -0.39 0.39 -0.4 -0.12 0.36 0.27 0.17 0.45 1.0 0.012 0.027 -0.016 0.023 0.0086 0.012 -0.0059 -0.016 -0.012 0.003 0.0012 0.067 -0.016 0.032 0.018 0.1 -0.0043 0.019 0.037 0.0019 0.071 0.0032 0.029 -0.016 0.034 0.007 0.017 -0.0042 -0.016 -0.015 0.00034 -0.0077 0.054 -0.0098 0.032 0.016 0.024 -0.0064 0.02 0.041 0.013 0.064 0.096
C44PWAPART -0.04 0.041 -0.042 -0.018 -0.049 0.053 -0.014 0.019 -0.021 -0.039 0.013 0.036 0.044 -0.014 -0.034 0.049 0.014 -0.044 0.037 0.0049 -0.07 0.024 -0.0077 -0.017 0.011 0.029 -0.0025 -0.03 0.0012 -0.016 0.016 0.024 -0.036 -0.0035 -0.02 0.019 -0.0043 -0.0068 -0.0019 0.026 0.01 0.02 0.012 1.0 -0.038 -0.11 0.15 -0.041 0.019 -0.022 -0.026 -0.075 -0.027 -0.15 0.13 -0.019 0.057 -0.0017 0.48 0.017 0.0021 -0.011 0.04 0.044 0.98 -0.045 -0.11 0.14 -0.039 0.017 -0.023 -0.025 -0.074 -0.026 -0.15 0.13 -0.0096 0.057 -0.0019 0.52 0.013 -0.0022 -0.0023 0.045 0.046 0.096
C45PWABEDR -0.0099 -0.0032 0.027 -0.0046 -0.0063 -0.019 0.024 -0.0034 -0.014 0.015 -0.0058 -0.0098 -0.015 0.00097 0.014 -0.0042 0.027 -0.011 0.0043 0.0087 0.04 -0.0037 0.0065 -0.027 0.0097 -0.0048 0.026 -0.011 -0.026 -0.033 0.032 0.006 0.027 -0.019 -0.0066 0.0063 0.0039 -0.00048 -0.0075 0.001 0.0032 0.011 0.027 -0.038 1.0 0.022 -0.0088 0.29 -0.0089 0.15 0.13 0.066 0.2 -0.029 0.035 -0.0072 -0.0088 0.2 0.11 -0.0022 -0.0076 -0.018 0.029 0.0011 -0.04 0.9 0.024 -0.0086 0.31 -0.012 0.13 0.14 0.061 0.22 -0.029 0.018 -0.0081 -0.0089 0.21 0.061 -0.0025 -0.0081 -0.017 0.022 -0.0013 -0.0019
C46PWALAND 0.076 -0.028 0.034 0.0079 0.086 -0.054 0.037 -0.019 -0.014 0.008 -0.0019 -0.0082 -0.029 0.0074 0.019 -0.061 -0.02 0.053 -0.015 0.0036 0.17 -0.047 -0.023 0.0037 2.9e-05 -0.0049 0.029 -0.0049 -0.022 -0.054 0.054 -0.0086 0.039 -0.029 -0.013 0.014 -0.02 0.022 0.0041 -0.0035 -0.017 0.0062 -0.016 -0.11 0.022 1.0 0.074 0.024 -0.0081 -0.0055 0.088 0.57 0.12 -0.0096 0.0079 0.056 0.014 0.057 0.23 -0.0028 -0.0061 -0.016 -0.0042 0.0035 -0.11 0.028 0.99 0.091 0.015 -0.0062 -0.0051 0.076 0.55 0.1 -0.0057 0.0028 0.046 0.018 0.062 0.13 -0.0033 0.0021 -0.017 0.0027 0.00016 -0.022
C47PPERSAUT -0.0085 -0.026 0.021 -0.015 -0.0057 -0.00025 0.019 -0.023 -0.018 0.026 0.0033 -0.033 -0.045 0.00075 0.033 -0.00082 0.0077 -0.0036 0.0072 -0.013 -0.04 0.011 -0.0028 -0.0027 0.0047 -0.014 -0.0037 0.017 -0.032 -0.0086 0.0083 0.047 -0.00018 -0.053 -0.01 0.009 -0.042 0.025 0.027 -0.0077 -0.029 0.024 0.023 0.15 -0.0088 0.074 1.0 0.014 0.057 0.0076 0.028 0.078 0.021 -0.19 0.071 0.0001 0.048 -0.0074 0.091 0.0043 0.025 -0.04 0.015 0.078 0.14 -0.022 0.075 0.92 0.019 0.054 0.006 0.031 0.071 0.025 -0.18 0.077 0.0072 0.046 -0.0047 0.018 -0.0075 0.032 -0.032 0.013 0.085 0.15
C48PBESAUT -0.028 -0.02 -0.0032 0.019 -0.027 0.0056 0.0073 0.009 -0.015 -0.0091 -0.01 0.018 0.012 -0.0021 -0.0086 -0.01 0.0079 -0.00083 0.0015 0.0092 0.0016 0.0052 -0.015 0.012 -0.0051 0.0072 0.011 -0.024 0.026 -0.01 0.01 -0.0099 0.0014 0.013 0.0013 -0.0016 0.025 -0.018 -0.0044 -0.0091 -0.0052 -0.015 0.0086 -0.041 0.29 0.024 0.014 1.0 0.032 0.22 0.14 0.055 0.21 -0.024 0.026 -0.006 -0.0072 0.056 0.034 -0.0018 -0.0063 -0.0023 -0.0069 0.018 -0.041 0.19 0.026 0.039 0.9 0.026 0.18 0.15 0.049 0.23 -0.024 0.048 -0.0066 -0.0074 0.045 0.0033 -0.0021 -0.0067 -0.0045 -0.0079 0.022 -0.0065
C49PMOTSCO -0.0036 -0.016 0.022 -0.017 -0.0035 0.013 -0.0044 -0.0014 -0.0026 0.021 0.016 -0.026 -0.022 0.0019 0.016 -0.02 0.0088 0.007 0.0067 -0.031 -0.0068 0.018 0.0033 -0.01 -0.017 0.0077 0.0079 -0.0057 0.0069 -0.027 0.028 0.014 0.0072 -0.015 0.013 -0.013 -0.00025 0.025 -0.018 0.00069 -0.0021 -0.0022 0.012 0.019 -0.0089 -0.0081 0.057 0.032 1.0 -0.0075 0.0059 -0.0072 -0.011 -0.042 0.039 0.018 -0.0076 0.0061 0.0038 -0.0039 0.0033 -0.013 0.0075 0.021 0.014 -0.013 -0.0068 0.073 0.018 0.9 -0.0069 0.0094 -0.0082 -0.0097 -0.042 0.032 0.028 -0.0063 0.0031 -0.018 -0.0044 -0.00032 -0.011 0.013 0.025 0.0056
C50PVRAAUT 0.018 -0.011 -0.0003 -0.0099 0.021 -0.0058 0.027 -0.0068 -0.023 0.0073 0.0061 -0.0061 0.0012 0.0075 -0.0075 -0.0097 0.0098 -0.0017 0.003 -0.009 0.031 -0.0032 -0.0073 0.0025 0.0081 0.0082 -0.00098 -0.00026 -0.0031 -0.016 0.016 0.0008 0.014 -0.012 -0.004 0.0039 -0.013 0.02 0.0039 -0.0029 -0.014 -0.0081 -0.0059 -0.022 0.15 -0.0055 0.0076 0.22 -0.0075 1.0 0.13 0.064 0.11 -0.01 -0.0084 -0.0025 -0.0031 0.065 0.015 -0.00076 -0.0027 0.021 -0.003 -0.0045 -0.023 0.11 -0.0056 0.028 0.21 -0.0069 0.95 0.13 0.065 0.066 -0.01 -0.0078 -0.0028 -0.0031 0.052 -0.0042 -0.00088 -0.0028 0.014 -0.0034 -0.0046 -0.0097
C51PAANHANG 0.032 -0.011 0.012 0.013 0.036 -0.0096 0.02 -0.016 -0.0079 0.015 -0.011 -0.0086 -0.01 0.017 -0.0047 -0.034 -0.0018 0.024 -0.0082 -0.013 0.077 -0.0095 -0.021 0.015 -0.0083 -0.0012 0.024 -0.0052 -0.0057 -0.029 0.028 -0.015 0.035 -0.0086 -0.011 0.012 -0.012 0.011 -0.011 0.024 -0.004 0.016 -0.016 -0.026 0.13 0.088 0.028 0.14 0.0059 0.13 1.0 0.083 0.12 -0.0072 -0.007 0.009 0.00057 0.058 0.057 0.035 0.064 -0.0056 0.016 -0.011 -0.028 0.098 0.094 0.046 0.17 0.0035 0.086 0.97 0.077 0.18 -0.011 -0.00074 0.015 0.002 0.046 0.034 0.069 0.032 -0.0072 0.027 -0.012 0.013
C52PTRACTOR 0.097 -0.031 0.059 -0.00045 0.11 -0.049 0.044 -0.033 -0.013 0.016 -0.0083 -0.0098 -0.032 -0.0058 0.036 -0.065 -0.013 0.049 -0.01 -0.00055 0.21 -0.064 -0.023 -0.0078 0.0059 -0.0047 0.053 -0.031 -0.026 -0.062 0.062 -0.026 0.065 -0.034 -0.033 0.034 -0.0062 0.00016 0.017 -0.015 -0.032 -0.0047 -0.012 -0.075 0.066 0.57 0.078 0.055 -0.0072 0.064 0.083 1.0 0.22 -0.0024 0.00032 0.063 0.025 0.086 0.21 -0.003 -0.0012 -0.01 -0.0076 0.029 -0.076 0.077 0.57 0.098 0.058 -0.0051 0.063 0.066 0.93 0.18 -0.001 -0.01 0.055 0.023 0.079 0.11 -0.0035 -0.00084 -0.012 -0.004 0.022 -0.012
C53PWERKT 0.032 -0.0081 0.032 0.0034 0.037 -0.0052 0.011 0.0012 -0.012 0.015 -0.024 -0.0057 -0.0069 -0.003 0.0083 -0.036 -0.005 0.024 -0.02 -0.013 0.067 -0.015 -0.0038 0.01 -0.01 0.023 0.01 -0.011 -0.021 -0.027 0.027 -0.013 0.014 0.0029 0.0019 -0.002 0.02 -0.015 -0.0022 -0.02 -0.021 -0.016 0.003 -0.027 0.2 0.12 0.021 0.21 -0.011 0.11 0.12 0.22 1.0 0.0015 0.014 0.097 -0.0045 0.092 0.063 -0.0011 -0.0039 -0.0092 -0.0044 -0.0066 -0.028 0.13 0.13 0.036 0.28 -0.01 0.07 0.13 0.25 0.91 0.0018 -0.0036 0.058 -0.0046 0.074 0.028 -0.0013 -0.0042 -0.0086 -0.005 -0.0068 -0.014
C54PBROM 0.025 -0.014 0.023 -0.0039 0.03 -0.02 0.014 -0.031 0.011 0.014 0.0019 -0.019 -0.034 0.008 0.02 -0.035 -0.032 0.045 -0.036 -0.014 0.05 -0.03 0.019 0.028 -0.022 -0.03 0.011 0.027 0.01 -0.018 0.018 -0.012 0.031 -0.004 0.015 -0.015 0.0093 0.0058 -0.012 -0.022 -0.0018 -0.012 0.0012 -0.15 -0.029 -0.0096 -0.19 -0.024 -0.042 -0.01 -0.0072 -0.0024 0.0015 1.0 -0.041 -0.017 -0.0045 -0.017 -0.17 -0.0052 -0.018 -0.025 -0.02 -0.017 -0.15 -0.029 -0.01 -0.17 -0.021 -0.039 -0.0094 -0.0079 0.006 -0.003 0.97 -0.042 -0.019 -0.0057 -0.016 -0.19 -0.006 -0.019 -0.027 -0.023 -0.017 -0.044
C55PLEVEN -0.043 0.019 0.026 -0.029 -0.04 0.0014 -0.012 -0.019 0.012 0.032 0.0022 -0.034 -0.029 -0.0026 0.026 0.028 0.043 -0.05 0.034 -0.0092 -0.0058 0.009 -0.012 -0.024 0.044 0.005 0.016 -0.041 -0.021 -0.038 0.04 0.031 0.0086 -0.041 -0.045 0.045 -0.031 -0.0009 0.032 0.0066 0.0035 0.032 0.067 0.13 0.035 0.0079 0.071 0.026 0.039 -0.0084 -0.007 0.00032 0.014 -0.041 1.0 0.035 0.13 -0.0014 0.12 -0.0043 0.019 -0.002 0.022 0.04 0.12 0.02 0.0049 0.073 0.013 0.045 -0.0077 -0.0019 0.00065 0.014 -0.041 0.85 0.039 0.13 -0.0031 0.085 -0.0049 0.0075 -0.0055 0.028 0.04 0.021
C56PPERSONG 0.028 0.0084 0.0038 0.02 0.031 -0.008 -0.02 -0.00048 0.017 0.017 -0.026 -0.0096 -0.0023 -0.0048 0.0061 -0.016 -0.0066 0.015 -0.00028 -0.0062 0.03 -0.015 0.011 -0.007 0.0087 -0.0065 0.0026 -0.0016 0.014 0.0019 -0.0021 -0.0096 0.0073 0.0042 -0.017 0.017 0.00043 0.021 -0.012 -0.0097 -0.0048 -0.014 -0.016 -0.019 -0.0072 0.056 0.0001 -0.006 0.018 -0.0025 0.009 0.063 0.097 -0.017 0.035 1.0 -0.0052 -0.0041 0.014 -0.0013 0.0015 -0.011 -0.005 -0.0076 -0.02 -0.0072 0.048 0.0041 -0.0053 0.0097 -0.0023 0.0065 0.059 0.13 -0.017 0.013 0.9 -0.0053 -0.0039 0.0079 -0.0015 0.015 -0.0099 -0.0057 -0.0078 -0.0096
C57PGEZONG -0.015 0.011 0.016 -0.0068 -0.013 0.008 0.00063 -0.0046 0.0061 0.021 -0.0034 -0.024 -0.034 0.0085 0.016 0.0093 0.0029 -0.011 0.0071 -0.013 0.0055 -0.0019 0.0043 -0.013 0.00091 0.011 0.0017 -0.0007 -0.021 -0.02 0.022 0.043 -0.013 -0.034 -0.0053 0.0059 -0.022 -0.022 0.04 0.029 -0.0098 0.028 0.032 0.057 -0.0088 0.014 0.048 -0.0072 -0.0076 -0.0031 0.00057 0.025 -0.0045 -0.0045 0.13 -0.0052 1.0 -0.005 0.058 -0.0016 -0.0055 0.01 0.011 0.13 0.055 -0.0087 0.014 0.049 -0.0064 -0.0064 -0.0028 0.0063 0.026 -0.004 -0.0043 0.11 -0.0058 0.98 -0.0048 0.04 -0.0018 -0.0059 0.03 0.013 0.12 0.04
C58PWAOREG 0.0014 -0.017 0.017 -0.0016 0.0014 -0.014 0.0064 0.006 -0.0013 0.013 -0.01 -0.0063 -0.0065 -0.0021 0.0061 -0.012 0.00023 0.0073 0.008 0.01 0.027 -0.002 -0.0088 -0.013 -0.0016 0.014 0.02 -0.02 -0.024 -0.03 0.03 -0.0084 0.028 -0.0067 -0.0053 0.0051 0.0073 0.022 -0.022 -0.013 -0.0081 -0.013 0.018 -0.0017 0.2 0.057 -0.0074 0.056 0.0061 0.065 0.058 0.086 0.092 -0.017 -0.0014 -0.0041 -0.005 1.0 0.065 -0.0012 -0.0043 -0.01 -0.0048 -0.0073 -0.0095 0.19 0.049 -1.4e-05 0.037 0.00075 0.041 0.059 0.082 0.063 -0.017 -0.0054 -0.0046 -0.0051 0.95 0.028 -0.0014 -0.0046 -0.0094 -0.0055 -0.0075 0.031
C59PBRAND -0.0098 -0.0014 0.056 0.019 -0.00032 -0.0039 0.077 -0.02 -0.074 0.058 -0.039 -0.042 -0.052 0.012 0.033 0.027 -0.0048 -0.017 0.062 0.044 0.087 -0.012 -0.036 -0.059 0.056 0.0054 0.025 -0.046 -0.057 -0.18 0.18 0.02 0.069 -0.078 -0.072 0.071 -0.083 -0.0042 0.064 0.038 0.03 0.08 0.1 0.48 0.11 0.23 0.091 0.034 0.0038 0.015 0.057 0.21 0.063 -0.17 0.12 0.014 0.058 0.065 1.0 0.014 0.019 -0.038 0.033 0.061 0.49 0.082 0.23 0.1 0.031 0.0029 0.011 0.053 0.2 0.053 -0.17 0.13 0.014 0.059 0.074 0.87 0.01 0.013 -0.027 0.037 0.054 0.094
C60PZEILPL 0.014 -0.0054 0.008 0.00021 0.015 0.0099 0.0089 -0.021 -0.011 0.013 -0.018 -0.0079 -0.0097 0.0045 0.0089 0.011 -0.0084 0.002 -0.001 0.031 0.035 -0.012 -0.00023 -0.015 0.011 0.0088 0.01 -0.014 -0.016 -0.021 0.021 -0.0056 0.034 -0.024 -0.039 0.039 0.004 -0.0098 -0.012 0.027 -0.0073 0.012 -0.0043 0.017 -0.0022 -0.0028 0.0043 -0.0018 -0.0039 -0.00076 0.035 -0.003 -0.0011 -0.0052 -0.0043 -0.0013 -0.0016 -0.0012 0.014 1.0 0.085 -0.0032 0.018 -0.0023 0.016 -0.0022 -0.0029 0.0012 -0.0016 -0.0035 -0.0007 0.029 -0.0028 -0.00098 -0.0052 -0.004 -0.0014 -0.0016 -0.0012 0.0081 0.87 0.047 -0.003 0.042 -0.0023 0.012
C61PPLEZIER -0.018 0.0044 0.0019 -0.0047 -0.021 0.013 0.018 0.0051 -0.024 -0.011 0.021 0.0026 -0.012 0.015 -0.012 0.0079 0.019 -0.019 0.012 0.024 -0.0056 -0.0085 -0.0026 -0.0055 0.018 0.0025 -0.0088 -0.013 0.007 -0.022 0.022 -0.019 0.01 0.0092 -0.014 0.013 -0.012 0.0013 0.014 -0.0041 0.0076 0.0099 0.019 0.0021 -0.0076 -0.0061 0.025 -0.0063 0.0033 -0.0027 0.064 -0.0012 -0.0039 -0.018 0.019 0.0015 -0.0055 -0.0043 0.019 0.085 1.0 -0.011 0.038 0.01 -0.00042 -0.0076 -0.0056 0.044 -0.0056 -0.0014 -0.0025 0.053 -0.0018 -0.0034 -0.018 0.0026 0.0036 -0.0056 -0.0041 0.0048 0.16 0.9 -0.01 0.064 0.013 0.091
C62PFIETS -0.012 -0.017 0.031 0.019 -0.015 0.0018 0.0056 0.027 -0.018 0.018 -0.036 -0.0049 -0.0088 -0.006 0.0081 0.035 0.0083 -0.032 0.033 0.015 -0.036 0.0005 -0.0065 -0.035 0.024 0.012 -0.025 -0.016 -0.019 -0.027 0.027 0.027 -0.0096 -0.019 -0.021 0.022 -0.023 -0.0045 0.02 0.0037 0.0024 0.025 0.037 -0.011 -0.018 -0.016 -0.04 -0.0023 -0.013 0.021 -0.0056 -0.01 -0.0092 -0.025 -0.002 -0.011 0.01 -0.01 -0.038 -0.0032 -0.011 1.0 0.0038 0.008 -0.014 -0.018 -0.016 -0.039 -0.0046 -0.015 0.029 -0.0073 -0.0089 -0.008 -0.026 0.0079 -0.012 0.014 -0.0096 -0.041 -0.0037 -0.012 0.94 0.01 0.0083 0.029
C63PINBOED -0.017 0.0081 0.013 -0.014 -0.017 -0.002 0.0014 -0.012 0.006 -0.0087 0.019 0.0037 -0.0027 -0.011 0.0091 0.022 0.021 -0.03 0.022 0.0063 -0.02 0.0074 -0.0034 -0.021 0.025 0.0024 -0.014 -0.007 -0.0097 -0.0083 0.0081 0.032 -0.00055 -0.027 -0.015 0.016 -0.0093 -0.0088 0.016 0.0047 0.019 0.014 0.0019 0.04 0.029 -0.0042 0.015 -0.0069 0.0075 -0.003 0.016 -0.0076 -0.0044 -0.02 0.022 -0.005 0.011 -0.0048 0.033 0.018 0.038 0.0038 1.0 0.032 0.036 0.023 -0.0052 0.029 -0.0062 0.0046 -0.0027 0.026 -0.0072 -0.0038 -0.02 0.027 -0.0056 0.015 -0.0046 0.026 0.035 0.025 0.00043 0.88 0.026 0.0055
C64PBYSTAND -0.059 -0.0038 0.035 -0.019 -0.057 0.00045 0.019 0.0098 -0.032 0.036 -0.011 -0.041 -0.037 -0.00021 0.023 0.037 0.036 -0.055 0.021 0.0053 -0.027 0.037 -0.048 -0.0059 0.019 0.01 0.035 -0.036 -0.042 -0.042 0.042 0.03 0.0092 -0.044 -0.036 0.036 -0.052 -0.0023 0.036 0.029 0.018 0.05 0.071 0.044 0.0011 0.0035 0.078 0.018 0.021 -0.0045 -0.011 0.029 -0.0066 -0.017 0.04 -0.0076 0.13 -0.0073 0.061 -0.0023 0.01 0.008 0.032 1.0 0.04 -0.00029 0.0068 0.095 0.01 0.018 -0.0041 -0.012 0.026 -0.0058 -0.015 0.02 -0.0085 0.13 -0.007 0.041 -0.0026 0.022 0.0084 0.027 0.97 0.063
C65AWAPART -0.033 0.043 -0.044 -0.014 -0.042 0.046 -0.014 0.017 -0.018 -0.043 0.014 0.041 0.047 -0.018 -0.033 0.044 0.0051 -0.035 0.029 0.0011 -0.069 0.022 -0.003 -0.0094 0.0032 0.024 -0.0053 -0.023 0.0094 -0.0072 0.0071 0.016 -0.039 0.0063 -0.014 0.012 0.0045 -0.012 -0.0059 0.026 0.01 0.015 0.0032 0.98 -0.04 -0.11 0.14 -0.041 0.014 -0.023 -0.028 -0.076 -0.028 -0.15 0.12 -0.02 0.055 -0.0095 0.49 0.016 -0.00042 -0.014 0.036 0.04 1.0 -0.046 -0.11 0.13 -0.039 0.012 -0.024 -0.026 -0.075 -0.027 -0.15 0.13 -0.012 0.055 -0.0085 0.53 0.012 -0.0047 -0.0059 0.04 0.042 0.089
C66AWABEDR -0.0041 0.0047 0.035 -0.0082 -0.00069 -0.027 0.025 -0.01 -0.009 0.018 -0.0026 -0.017 -0.022 -0.00064 0.019 0.005 0.017 -0.0074 -0.00069 0.0029 0.038 -0.0065 0.015 -0.028 0.015 -0.015 0.03 -0.011 -0.027 -0.035 0.035 0.011 0.026 -0.021 -0.007 0.0067 -0.0014 -0.0062 0.0021 0.0039 0.0036 0.017 0.029 -0.045 0.9 0.028 -0.022 0.19 -0.013 0.11 0.098 0.077 0.13 -0.029 0.02 -0.0072 -0.0087 0.19 0.082 -0.0022 -0.0076 -0.018 0.023 -0.00029 -0.046 1.0 0.029 -0.02 0.18 -0.014 0.098 0.1 0.07 0.13 -0.029 0.0082 -0.0081 -0.0089 0.19 0.052 -0.0025 -0.0081 -0.017 0.019 -0.0024 -0.00076
C67AWALAND 0.08 -0.028 0.036 0.0075 0.09 -0.055 0.04 -0.017 -0.017 0.01 -0.0013 -0.01 -0.031 0.01 0.019 -0.064 -0.019 0.054 -0.015 0.0019 0.18 -0.047 -0.024 0.0023 -0.0017 -0.0044 0.032 -0.005 -0.022 -0.054 0.054 -0.0084 0.039 -0.029 -0.013 0.014 -0.02 0.021 0.0058 -0.0037 -0.014 0.0072 -0.016 -0.11 0.024 0.99 0.075 0.026 -0.0068 -0.0056 0.094 0.57 0.13 -0.01 0.0049 0.048 0.014 0.049 0.23 -0.0029 -0.0056 -0.016 -0.0052 0.0068 -0.11 0.029 1.0 0.091 0.016 -0.0049 -0.0052 0.082 0.55 0.1 -0.0066 -0.00062 0.039 0.018 0.054 0.13 -0.0033 0.0041 -0.016 0.00069 0.0029 -0.021
C68APERSAUT -0.0083 -0.029 0.021 -0.016 -0.0056 -0.0027 0.028 -0.025 -0.024 0.028 -0.00025 -0.033 -0.044 0.0019 0.029 0.0065 0.01 -0.0094 0.0099 -0.0042 -0.031 0.014 -0.011 -0.012 0.012 -0.014 0.0056 0.0039 -0.032 -0.021 0.021 0.047 0.0044 -0.057 -0.019 0.018 -0.037 0.017 0.026 -0.0087 -0.023 0.027 0.034 0.14 -0.0086 0.091 0.92 0.039 0.073 0.028 0.046 0.098 0.036 -0.17 0.073 0.0041 0.049 -1.4e-05 0.1 0.0012 0.044 -0.039 0.029 0.095 0.13 -0.02 0.091 1.0 0.039 0.065 0.03 0.045 0.085 0.038 -0.16 0.093 0.0061 0.045 0.003 0.026 -0.0086 0.046 -0.03 0.032 0.1 0.14
C69ABESAUT -0.028 -0.019 -0.009 0.022 -0.029 0.0073 0.0098 0.0062 -0.019 -0.011 -0.0081 0.02 0.018 0.00077 -0.017 -0.016 0.0094 0.0029 -0.00044 0.0046 -0.0011 0.0051 -0.013 0.015 -0.0091 0.0059 0.0057 -0.016 0.026 -0.0092 0.0089 -0.019 0.0052 0.022 0.0021 -0.0023 0.027 -0.02 -0.0032 -0.013 -0.0057 -0.017 0.007 -0.039 0.31 0.015 0.019 0.9 0.018 0.21 0.17 0.058 0.28 -0.021 0.013 -0.0053 -0.0064 0.037 0.031 -0.0016 -0.0056 -0.0046 -0.0062 0.01 -0.039 0.18 0.016 0.039 1.0 0.014 0.17 0.18 0.044 0.38 -0.021 0.029 -0.0059 -0.0065 0.029 0.00053 -0.0018 -0.0059 -0.0059 -0.007 0.012 -0.0092
C70AMOTSCO -0.013 -0.019 0.013 -0.016 -0.012 0.0094 -0.003 0.0017 -0.0037 0.013 0.019 -0.018 -0.011 -0.00049 0.013 -0.017 0.0077 0.0053 0.0063 -0.022 -0.0049 0.014 0.00063 -0.0036 -0.0097 0.012 0.0018 -0.0083 0.0086 -0.022 0.022 0.011 0.004 -0.0081 0.017 -0.016 0.0064 0.017 -0.015 0.0024 -0.006 -0.0065 0.017 0.017 -0.012 -0.0062 0.054 0.026 0.9 -0.0069 0.0035 -0.0051 -0.01 -0.039 0.045 0.0097 -0.0064 0.00075 0.0029 -0.0035 -0.0014 -0.015 0.0046 0.018 0.012 -0.014 -0.0049 0.065 0.014 1.0 -0.0064 0.006 -0.0064 -0.0089 -0.039 0.039 0.018 -0.0052 -0.0011 -0.019 -0.0041 -0.004 -0.013 0.0092 0.022 0.0086
C71AVRAAUT 0.019 -0.0097 0.00061 -0.013 0.022 -0.011 0.025 -0.0051 -0.018 0.0095 0.0099 -0.011 -0.0023 0.014 -0.0094 -0.01 0.015 -0.0053 0.0082 -0.011 0.032 -0.001 -0.011 -0.0016 0.011 0.0084 0.00066 -0.0026 -0.006 -0.014 0.014 -0.00092 0.018 -0.013 -0.005 0.0049 -0.018 0.023 0.0035 -0.00082 -0.013 -0.0046 -0.0042 -0.023 0.13 -0.0051 0.006 0.18 -0.0069 0.95 0.086 0.063 0.07 -0.0094 -0.0077 -0.0023 -0.0028 0.041 0.011 -0.0007 -0.0025 0.029 -0.0027 -0.0041 -0.024 0.098 -0.0052 0.03 0.17 -0.0064 1.0 0.083 0.075 0.042 -0.0094 -0.0072 -0.0026 -0.0029 0.033 -0.0069 -0.00081 -0.0026 0.021 -0.0031 -0.0042 -0.009
C72AAANHANG 0.03 -0.01 0.015 0.0061 0.034 -0.0039 0.016 -0.016 -0.005 0.015 -0.0078 -0.0097 -0.01 0.014 -0.002 -0.033 -0.0013 0.023 -0.0086 -0.012 0.066 -0.0034 -0.017 0.01 -0.011 0.0028 0.023 -0.0031 -0.011 -0.026 0.025 -0.013 0.034 -0.0094 -0.0057 0.0061 -0.016 0.014 -0.0081 0.021 -0.002 0.015 -0.016 -0.025 0.14 0.076 0.031 0.15 0.0094 0.13 0.97 0.066 0.13 -0.0079 -0.0019 0.0065 0.0063 0.059 0.053 0.029 0.053 -0.0073 0.026 -0.012 -0.026 0.1 0.082 0.045 0.18 0.006 0.083 1.0 0.06 0.19 -0.011 0.0051 0.011 0.0089 0.047 0.033 0.058 0.026 -0.0085 0.037 -0.012 0.0094
C73ATRACTOR 0.092 -0.028 0.059 -0.0055 0.11 -0.044 0.043 -0.029 -0.014 0.021 -0.006 -0.014 -0.034 -6.4e-05 0.031 -0.058 -0.0012 0.035 -0.0061 0.0074 0.2 -0.058 -0.029 -0.011 0.019 -0.0059 0.045 -0.033 -0.037 -0.059 0.06 -0.022 0.063 -0.038 -0.033 0.035 -0.011 -0.0042 0.029 -0.016 -0.028 0.0023 -0.015 -0.074 0.061 0.55 0.071 0.049 -0.0082 0.065 0.077 0.93 0.25 0.006 0.00065 0.059 0.026 0.082 0.2 -0.0028 -0.0018 -0.0089 -0.0072 0.026 -0.075 0.07 0.55 0.085 0.044 -0.0064 0.075 0.06 1.0 0.18 0.0059 -0.0095 0.049 0.024 0.075 0.11 -0.0032 -0.0016 -0.011 -0.0043 0.019 -0.017
C74AWERKT 0.02 -0.0068 0.027 0.0056 0.022 -0.0042 0.0076 0.00066 -0.0081 0.015 -0.026 -0.006 -0.0046 0.0015 0.005 -0.03 -0.0076 0.025 -0.021 -0.013 0.05 -0.0033 -0.0031 0.0098 -0.016 0.023 0.011 -0.0067 -0.015 -0.015 0.015 -0.016 0.017 0.0082 0.0028 -0.003 0.019 -0.0068 -0.0096 -0.015 -0.018 -0.015 0.00034 -0.026 0.22 0.1 0.025 0.23 -0.0097 0.066 0.18 0.18 0.91 -0.003 0.014 0.13 -0.004 0.063 0.053 -0.00098 -0.0034 -0.008 -0.0038 -0.0058 -0.027 0.13 0.1 0.038 0.38 -0.0089 0.042 0.19 0.18 1.0 -0.0028 -0.0028 0.072 -0.004 0.051 0.021 -0.0011 -0.0037 -0.0075 -0.0043 -0.0059 -0.013
C75ABROM 0.032 -0.013 0.019 0.00047 0.036 -0.024 0.016 -0.025 0.0085 0.015 0.0018 -0.019 -0.031 0.012 0.017 -0.041 -0.031 0.047 -0.039 -0.017 0.045 -0.028 0.021 0.032 -0.03 -0.028 0.011 0.031 0.012 -0.012 0.012 -0.009 0.028 -0.0058 0.016 -0.016 0.0099 0.011 -0.014 -0.024 -0.0048 -0.015 -0.0077 -0.15 -0.029 -0.0057 -0.18 -0.024 -0.042 -0.01 -0.011 -0.001 0.0018 0.97 -0.041 -0.017 -0.0043 -0.017 -0.17 -0.0052 -0.018 -0.026 -0.02 -0.015 -0.15 -0.029 -0.0066 -0.16 -0.021 -0.039 -0.0094 -0.011 0.0059 -0.0028 1.0 -0.042 -0.019 -0.0054 -0.016 -0.18 -0.006 -0.02 -0.028 -0.023 -0.015 -0.045
C76ALEVEN -0.029 0.021 0.017 -0.025 -0.027 0.0038 -0.018 0.0031 0.01 0.015 0.0037 -0.016 -0.014 -0.02 0.025 0.014 0.03 -0.03 0.016 -0.014 -0.0043 0.0059 0.0042 -0.015 0.023 -0.0023 0.018 -0.024 -0.0098 -0.026 0.026 0.034 -0.0068 -0.031 -0.029 0.029 -0.014 -0.0078 0.024 0.00076 0.0005 0.018 0.054 0.13 0.018 0.0028 0.077 0.048 0.032 -0.0078 -0.00074 -0.01 -0.0036 -0.042 0.85 0.013 0.11 -0.0054 0.13 -0.004 0.0026 0.0079 0.027 0.02 0.13 0.0082 -0.00062 0.093 0.029 0.039 -0.0072 0.0051 -0.0095 -0.0028 -0.042 1.0 0.016 0.12 -0.0063 0.092 -0.0046 -0.0038 0.0039 0.033 0.018 0.037
C77APERSONG 0.022 0.0033 -0.00013 0.021 0.024 -0.0084 -0.014 0.0066 0.0088 0.012 -0.021 -0.0069 0.0019 -0.0046 0.00082 -0.012 -0.0039 0.011 0.0082 -0.01 0.017 -0.011 0.0098 -0.0091 0.0093 -0.0032 0.0057 -0.0031 0.0071 0.0051 -0.0053 -0.013 0.0063 0.0048 -0.02 0.02 -0.0054 0.021 -0.00082 -0.014 -0.0098 -0.013 -0.0098 -0.0096 -0.0081 0.046 0.0072 -0.0066 0.028 -0.0028 0.015 0.055 0.058 -0.019 0.039 0.9 -0.0058 -0.0046 0.014 -0.0014 0.0036 -0.012 -0.0056 -0.0085 -0.012 -0.0081 0.039 0.0061 -0.0059 0.018 -0.0026 0.011 0.049 0.072 -0.019 0.016 1.0 -0.0059 -0.0044 0.014 -0.0017 0.024 -0.011 -0.0064 -0.0087 -0.0085
C78AGEZONG -0.012 0.0094 0.017 -0.0017 -0.01 0.0054 0.0039 -0.0056 0.0042 0.022 -0.0079 -0.024 -0.033 0.0095 0.014 0.0046 0.00079 -0.0063 0.0071 -0.014 0.0064 -0.0025 0.0045 -0.013 -0.00073 0.011 -0.00097 0.0013 -0.021 -0.022 0.024 0.046 -0.016 -0.035 -0.0027 0.0036 -0.019 -0.017 0.036 0.023 -0.01 0.023 0.032 0.057 -0.0089 0.018 0.046 -0.0074 -0.0063 -0.0031 0.002 0.023 -0.0046 -0.0057 0.13 -0.0053 0.98 -0.0051 0.059 -0.0016 -0.0056 0.014 0.015 0.13 0.055 -0.0089 0.018 0.045 -0.0065 -0.0052 -0.0029 0.0089 0.024 -0.004 -0.0054 0.12 -0.0059 1.0 -0.0049 0.039 -0.0018 -0.006 0.038 0.017 0.11 0.034
C79AWAOREG -0.0029 -0.016 0.019 0.00064 -0.0022 -0.013 0.004 0.018 -0.0041 0.012 -0.011 -0.0037 -0.0049 -0.0058 0.0098 -0.01 0.0044 0.0044 0.0035 0.012 0.038 -0.0015 -0.0076 -0.016 0.00031 0.013 0.017 -0.017 -0.024 -0.028 0.028 -0.0058 0.023 -0.0068 0.0017 -0.0019 0.0069 0.023 -0.02 -0.014 -0.0059 -0.014 0.016 -0.0019 0.21 0.062 -0.0047 0.045 0.0031 0.052 0.046 0.079 0.074 -0.016 -0.0031 -0.0039 -0.0048 0.95 0.074 -0.0012 -0.0041 -0.0096 -0.0046 -0.007 -0.0085 0.19 0.054 0.003 0.029 -0.0011 0.033 0.047 0.075 0.051 -0.016 -0.0063 -0.0044 -0.0049 1.0 0.034 -0.0014 -0.0044 -0.009 -0.0052 -0.0071 0.022
C80ABRAND -0.012 0.00065 -0.0093 0.024 -0.012 0.0019 0.046 0.0063 -0.053 -0.008 -0.00047 0.013 0.012 0.0099 -0.021 0.016 -0.0093 -0.004 0.041 0.021 0.016 -0.0008 -0.021 -0.02 0.021 0.0032 0.0042 -0.019 -0.011 -0.082 0.082 -0.0087 0.021 -0.0089 -0.026 0.026 -0.025 0.0042 0.0083 0.022 0.023 0.021 0.024 0.52 0.061 0.13 0.018 0.0033 -0.018 -0.0042 0.034 0.11 0.028 -0.19 0.085 0.0079 0.04 0.028 0.87 0.0081 0.0048 -0.041 0.026 0.041 0.53 0.052 0.13 0.026 0.00053 -0.019 -0.0069 0.033 0.11 0.021 -0.18 0.092 0.014 0.039 0.034 1.0 0.0039 0.00018 -0.031 0.026 0.035 0.061
C81AZEILPL 0.0078 -0.0062 0.0092 0.00024 0.0071 -0.00068 0.014 -0.024 -0.0084 0.0097 -0.021 -0.0038 -0.011 0.011 0.0041 0.017 -0.0045 -0.0057 0.014 0.027 0.017 -0.015 0.0015 -0.018 0.027 0.001 -0.008 -0.013 -0.019 -0.024 0.024 0.0092 0.019 -0.028 -0.041 0.041 -0.0099 0.0016 -0.00076 0.01 -0.0083 0.0095 -0.0064 0.013 -0.0025 -0.0033 -0.0075 -0.0021 -0.0044 -0.00088 0.069 -0.0035 -0.0013 -0.006 -0.0049 -0.0015 -0.0018 -0.0014 0.01 0.87 0.16 -0.0037 0.035 -0.0026 0.012 -0.0025 -0.0033 -0.0086 -0.0018 -0.0041 -0.00081 0.058 -0.0032 -0.0011 -0.006 -0.0046 -0.0017 -0.0018 -0.0014 0.0039 1.0 0.091 -0.0034 0.082 -0.0027 0.026
C82APLEZIER -0.018 0.00067 0.00064 -0.0018 -0.021 0.012 0.018 0.0094 -0.026 -0.0016 0.02 -0.0063 -0.019 0.022 -0.01 0.0076 0.02 -0.019 0.0043 0.019 -0.0065 0.0075 -0.0021 -0.013 0.011 0.0075 -0.0042 -0.0082 0.001 -0.026 0.026 -0.01 0.0034 0.0032 -0.01 0.01 -0.012 0.0047 0.011 -0.0034 0.0073 0.012 0.02 -0.0022 -0.0081 0.0021 0.032 -0.0067 -0.00032 -0.0028 0.032 -0.00084 -0.0042 -0.019 0.0075 0.015 -0.0059 -0.0046 0.013 0.047 0.9 -0.012 0.025 0.022 -0.0047 -0.0081 0.0041 0.046 -0.0059 -0.004 -0.0026 0.026 -0.0016 -0.0037 -0.02 -0.0038 0.024 -0.006 -0.0044 0.00018 0.091 1.0 -0.011 0.04 0.026 0.11
C83AFIETS -0.016 -0.021 0.03 0.021 -0.018 -0.0015 0.0019 0.026 -0.011 0.021 -0.037 -0.0079 -0.01 -0.0033 0.0083 0.04 0.01 -0.035 0.036 0.024 -0.029 -0.0019 -0.01 -0.037 0.031 0.0066 -0.019 -0.023 -0.02 -0.03 0.03 0.029 -0.0092 -0.024 -0.024 0.025 -0.025 -0.0079 0.026 0.0047 0.00074 0.033 0.041 -0.0023 -0.017 -0.017 -0.032 -0.0045 -0.011 0.014 -0.0072 -0.012 -0.0086 -0.027 -0.0055 -0.0099 0.03 -0.0094 -0.027 -0.003 -0.01 0.94 0.00043 0.0084 -0.0059 -0.017 -0.016 -0.03 -0.0059 -0.013 0.021 -0.0085 -0.011 -0.0075 -0.028 0.0039 -0.011 0.038 -0.009 -0.031 -0.0034 -0.011 1.0 0.0048 0.0092 0.034
C84AINBOED -0.021 0.018 0.026 -0.02 -0.021 -0.011 0.0024 -0.0097 0.0073 -0.0034 0.018 -0.0026 -0.014 -0.01 0.017 0.016 0.018 -0.023 0.015 0.011 -0.02 0.012 -0.0012 -0.021 0.018 -0.0013 -0.017 -0.00088 -0.01 -0.012 0.012 0.034 0.0054 -0.033 -0.0055 0.0062 -0.018 -0.0057 0.023 0.00062 0.02 0.016 0.013 0.045 0.022 0.0027 0.013 -0.0079 0.013 -0.0034 0.027 -0.004 -0.005 -0.023 0.028 -0.0057 0.013 -0.0055 0.037 0.042 0.064 0.01 0.88 0.027 0.04 0.019 0.00069 0.032 -0.007 0.0092 -0.0031 0.037 -0.0043 -0.0043 -0.023 0.033 -0.0064 0.017 -0.0052 0.026 0.082 0.04 0.0048 1.0 0.021 0.018
C85ABYSTAND -0.054 -0.0042 0.028 -0.015 -0.052 -0.004 0.017 0.01 -0.027 0.037 -0.014 -0.04 -0.034 0.0017 0.02 0.04 0.031 -0.053 0.022 0.011 -0.029 0.034 -0.048 -0.0046 0.023 0.01 0.025 -0.033 -0.04 -0.037 0.038 0.033 0.0021 -0.044 -0.035 0.035 -0.055 0.00038 0.037 0.028 0.013 0.049 0.064 0.046 -0.0013 0.00016 0.085 0.022 0.025 -0.0046 -0.012 0.022 -0.0068 -0.017 0.04 -0.0078 0.12 -0.0075 0.054 -0.0023 0.013 0.0083 0.026 0.97 0.042 -0.0024 0.0029 0.1 0.012 0.022 -0.0042 -0.012 0.019 -0.0059 -0.015 0.018 -0.0087 0.11 -0.0071 0.035 -0.0027 0.026 0.0092 0.021 1.0 0.067
C86CARAVAN -0.069 -0.0098 0.036 0.0045 -0.069 0.0062 0.033 0.004 -0.042 0.07 -0.033 -0.062 -0.053 0.008 0.032 0.085 0.044 -0.091 0.065 0.022 -0.054 0.048 -0.042 -0.055 0.063 0.029 0.004 -0.042 -0.063 -0.079 0.078 0.07 0.0078 -0.077 -0.058 0.058 -0.08 -0.00098 0.058 0.058 -0.002 0.09 0.096 0.096 -0.0019 -0.022 0.15 -0.0065 0.0056 -0.0097 0.013 -0.012 -0.014 -0.044 0.021 -0.0096 0.04 0.031 0.094 0.012 0.091 0.029 0.0055 0.063 0.089 -0.00076 -0.021 0.14 -0.0092 0.0086 -0.009 0.0094 -0.017 -0.013 -0.045 0.037 -0.0085 0.034 0.022 0.061 0.026 0.11 0.034 0.018 0.067 1.0

1.2.3.2 Data Feature correlation between C44-C85

  • C44-C85:Since in this zone, there are three red line run through.
In [6]:
Train[["C44PWAPART", "C45PWABEDR","C46PWALAND", "C47PPERSAUT", 
                                       "C48PBESAUT","C49PMOTSCO", "C50PVRAAUT", "C51PAANHANG","C52PTRACTOR", "C53PWERKT", 
                                       "C54PBROM","C55PLEVEN", "C56PPERSONG", "C57PGEZONG","C58PWAOREG", "C59PBRAND",
                                       "C60PZEILPL","C61PPLEZIER", "C62PFIETS", "C63PINBOED","C64PBYSTAND", "C65AWAPART", "C66AWABEDR","C67AWALAND","C68APERSAUT", "C69ABESAUT", "C70AMOTSCO","C71AVRAAUT", 
                                       "C72AAANHANG", "C73ATRACTOR","C74AWERKT", "C75ABROM", "C76ALEVEN","C77APERSONG",
                                       "C78AGEZONG", "C79AWAOREG","C80ABRAND","C81AZEILPL", "C82APLEZIER", "C83AFIETS",
                                       "C84AINBOED", "C85ABYSTAND"]].corr(method='pearson').style.format("{:.2}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)
Out[6]:
C44PWAPART C45PWABEDR C46PWALAND C47PPERSAUT C48PBESAUT C49PMOTSCO C50PVRAAUT C51PAANHANG C52PTRACTOR C53PWERKT C54PBROM C55PLEVEN C56PPERSONG C57PGEZONG C58PWAOREG C59PBRAND C60PZEILPL C61PPLEZIER C62PFIETS C63PINBOED C64PBYSTAND C65AWAPART C66AWABEDR C67AWALAND C68APERSAUT C69ABESAUT C70AMOTSCO C71AVRAAUT C72AAANHANG C73ATRACTOR C74AWERKT C75ABROM C76ALEVEN C77APERSONG C78AGEZONG C79AWAOREG C80ABRAND C81AZEILPL C82APLEZIER C83AFIETS C84AINBOED C85ABYSTAND
C44PWAPART 1.0 -0.038 -0.11 0.15 -0.041 0.019 -0.022 -0.026 -0.075 -0.027 -0.15 0.13 -0.019 0.057 -0.0017 0.48 0.017 0.0021 -0.011 0.04 0.044 0.98 -0.045 -0.11 0.14 -0.039 0.017 -0.023 -0.025 -0.074 -0.026 -0.15 0.13 -0.0096 0.057 -0.0019 0.52 0.013 -0.0022 -0.0023 0.045 0.046
C45PWABEDR -0.038 1.0 0.022 -0.0088 0.29 -0.0089 0.15 0.13 0.066 0.2 -0.029 0.035 -0.0072 -0.0088 0.2 0.11 -0.0022 -0.0076 -0.018 0.029 0.0011 -0.04 0.9 0.024 -0.0086 0.31 -0.012 0.13 0.14 0.061 0.22 -0.029 0.018 -0.0081 -0.0089 0.21 0.061 -0.0025 -0.0081 -0.017 0.022 -0.0013
C46PWALAND -0.11 0.022 1.0 0.074 0.024 -0.0081 -0.0055 0.088 0.57 0.12 -0.0096 0.0079 0.056 0.014 0.057 0.23 -0.0028 -0.0061 -0.016 -0.0042 0.0035 -0.11 0.028 0.99 0.091 0.015 -0.0062 -0.0051 0.076 0.55 0.1 -0.0057 0.0028 0.046 0.018 0.062 0.13 -0.0033 0.0021 -0.017 0.0027 0.00016
C47PPERSAUT 0.15 -0.0088 0.074 1.0 0.014 0.057 0.0076 0.028 0.078 0.021 -0.19 0.071 0.0001 0.048 -0.0074 0.091 0.0043 0.025 -0.04 0.015 0.078 0.14 -0.022 0.075 0.92 0.019 0.054 0.006 0.031 0.071 0.025 -0.18 0.077 0.0072 0.046 -0.0047 0.018 -0.0075 0.032 -0.032 0.013 0.085
C48PBESAUT -0.041 0.29 0.024 0.014 1.0 0.032 0.22 0.14 0.055 0.21 -0.024 0.026 -0.006 -0.0072 0.056 0.034 -0.0018 -0.0063 -0.0023 -0.0069 0.018 -0.041 0.19 0.026 0.039 0.9 0.026 0.18 0.15 0.049 0.23 -0.024 0.048 -0.0066 -0.0074 0.045 0.0033 -0.0021 -0.0067 -0.0045 -0.0079 0.022
C49PMOTSCO 0.019 -0.0089 -0.0081 0.057 0.032 1.0 -0.0075 0.0059 -0.0072 -0.011 -0.042 0.039 0.018 -0.0076 0.0061 0.0038 -0.0039 0.0033 -0.013 0.0075 0.021 0.014 -0.013 -0.0068 0.073 0.018 0.9 -0.0069 0.0094 -0.0082 -0.0097 -0.042 0.032 0.028 -0.0063 0.0031 -0.018 -0.0044 -0.00032 -0.011 0.013 0.025
C50PVRAAUT -0.022 0.15 -0.0055 0.0076 0.22 -0.0075 1.0 0.13 0.064 0.11 -0.01 -0.0084 -0.0025 -0.0031 0.065 0.015 -0.00076 -0.0027 0.021 -0.003 -0.0045 -0.023 0.11 -0.0056 0.028 0.21 -0.0069 0.95 0.13 0.065 0.066 -0.01 -0.0078 -0.0028 -0.0031 0.052 -0.0042 -0.00088 -0.0028 0.014 -0.0034 -0.0046
C51PAANHANG -0.026 0.13 0.088 0.028 0.14 0.0059 0.13 1.0 0.083 0.12 -0.0072 -0.007 0.009 0.00057 0.058 0.057 0.035 0.064 -0.0056 0.016 -0.011 -0.028 0.098 0.094 0.046 0.17 0.0035 0.086 0.97 0.077 0.18 -0.011 -0.00074 0.015 0.002 0.046 0.034 0.069 0.032 -0.0072 0.027 -0.012
C52PTRACTOR -0.075 0.066 0.57 0.078 0.055 -0.0072 0.064 0.083 1.0 0.22 -0.0024 0.00032 0.063 0.025 0.086 0.21 -0.003 -0.0012 -0.01 -0.0076 0.029 -0.076 0.077 0.57 0.098 0.058 -0.0051 0.063 0.066 0.93 0.18 -0.001 -0.01 0.055 0.023 0.079 0.11 -0.0035 -0.00084 -0.012 -0.004 0.022
C53PWERKT -0.027 0.2 0.12 0.021 0.21 -0.011 0.11 0.12 0.22 1.0 0.0015 0.014 0.097 -0.0045 0.092 0.063 -0.0011 -0.0039 -0.0092 -0.0044 -0.0066 -0.028 0.13 0.13 0.036 0.28 -0.01 0.07 0.13 0.25 0.91 0.0018 -0.0036 0.058 -0.0046 0.074 0.028 -0.0013 -0.0042 -0.0086 -0.005 -0.0068
C54PBROM -0.15 -0.029 -0.0096 -0.19 -0.024 -0.042 -0.01 -0.0072 -0.0024 0.0015 1.0 -0.041 -0.017 -0.0045 -0.017 -0.17 -0.0052 -0.018 -0.025 -0.02 -0.017 -0.15 -0.029 -0.01 -0.17 -0.021 -0.039 -0.0094 -0.0079 0.006 -0.003 0.97 -0.042 -0.019 -0.0057 -0.016 -0.19 -0.006 -0.019 -0.027 -0.023 -0.017
C55PLEVEN 0.13 0.035 0.0079 0.071 0.026 0.039 -0.0084 -0.007 0.00032 0.014 -0.041 1.0 0.035 0.13 -0.0014 0.12 -0.0043 0.019 -0.002 0.022 0.04 0.12 0.02 0.0049 0.073 0.013 0.045 -0.0077 -0.0019 0.00065 0.014 -0.041 0.85 0.039 0.13 -0.0031 0.085 -0.0049 0.0075 -0.0055 0.028 0.04
C56PPERSONG -0.019 -0.0072 0.056 0.0001 -0.006 0.018 -0.0025 0.009 0.063 0.097 -0.017 0.035 1.0 -0.0052 -0.0041 0.014 -0.0013 0.0015 -0.011 -0.005 -0.0076 -0.02 -0.0072 0.048 0.0041 -0.0053 0.0097 -0.0023 0.0065 0.059 0.13 -0.017 0.013 0.9 -0.0053 -0.0039 0.0079 -0.0015 0.015 -0.0099 -0.0057 -0.0078
C57PGEZONG 0.057 -0.0088 0.014 0.048 -0.0072 -0.0076 -0.0031 0.00057 0.025 -0.0045 -0.0045 0.13 -0.0052 1.0 -0.005 0.058 -0.0016 -0.0055 0.01 0.011 0.13 0.055 -0.0087 0.014 0.049 -0.0064 -0.0064 -0.0028 0.0063 0.026 -0.004 -0.0043 0.11 -0.0058 0.98 -0.0048 0.04 -0.0018 -0.0059 0.03 0.013 0.12
C58PWAOREG -0.0017 0.2 0.057 -0.0074 0.056 0.0061 0.065 0.058 0.086 0.092 -0.017 -0.0014 -0.0041 -0.005 1.0 0.065 -0.0012 -0.0043 -0.01 -0.0048 -0.0073 -0.0095 0.19 0.049 -1.4e-05 0.037 0.00075 0.041 0.059 0.082 0.063 -0.017 -0.0054 -0.0046 -0.0051 0.95 0.028 -0.0014 -0.0046 -0.0094 -0.0055 -0.0075
C59PBRAND 0.48 0.11 0.23 0.091 0.034 0.0038 0.015 0.057 0.21 0.063 -0.17 0.12 0.014 0.058 0.065 1.0 0.014 0.019 -0.038 0.033 0.061 0.49 0.082 0.23 0.1 0.031 0.0029 0.011 0.053 0.2 0.053 -0.17 0.13 0.014 0.059 0.074 0.87 0.01 0.013 -0.027 0.037 0.054
C60PZEILPL 0.017 -0.0022 -0.0028 0.0043 -0.0018 -0.0039 -0.00076 0.035 -0.003 -0.0011 -0.0052 -0.0043 -0.0013 -0.0016 -0.0012 0.014 1.0 0.085 -0.0032 0.018 -0.0023 0.016 -0.0022 -0.0029 0.0012 -0.0016 -0.0035 -0.0007 0.029 -0.0028 -0.00098 -0.0052 -0.004 -0.0014 -0.0016 -0.0012 0.0081 0.87 0.047 -0.003 0.042 -0.0023
C61PPLEZIER 0.0021 -0.0076 -0.0061 0.025 -0.0063 0.0033 -0.0027 0.064 -0.0012 -0.0039 -0.018 0.019 0.0015 -0.0055 -0.0043 0.019 0.085 1.0 -0.011 0.038 0.01 -0.00042 -0.0076 -0.0056 0.044 -0.0056 -0.0014 -0.0025 0.053 -0.0018 -0.0034 -0.018 0.0026 0.0036 -0.0056 -0.0041 0.0048 0.16 0.9 -0.01 0.064 0.013
C62PFIETS -0.011 -0.018 -0.016 -0.04 -0.0023 -0.013 0.021 -0.0056 -0.01 -0.0092 -0.025 -0.002 -0.011 0.01 -0.01 -0.038 -0.0032 -0.011 1.0 0.0038 0.008 -0.014 -0.018 -0.016 -0.039 -0.0046 -0.015 0.029 -0.0073 -0.0089 -0.008 -0.026 0.0079 -0.012 0.014 -0.0096 -0.041 -0.0037 -0.012 0.94 0.01 0.0083
C63PINBOED 0.04 0.029 -0.0042 0.015 -0.0069 0.0075 -0.003 0.016 -0.0076 -0.0044 -0.02 0.022 -0.005 0.011 -0.0048 0.033 0.018 0.038 0.0038 1.0 0.032 0.036 0.023 -0.0052 0.029 -0.0062 0.0046 -0.0027 0.026 -0.0072 -0.0038 -0.02 0.027 -0.0056 0.015 -0.0046 0.026 0.035 0.025 0.00043 0.88 0.026
C64PBYSTAND 0.044 0.0011 0.0035 0.078 0.018 0.021 -0.0045 -0.011 0.029 -0.0066 -0.017 0.04 -0.0076 0.13 -0.0073 0.061 -0.0023 0.01 0.008 0.032 1.0 0.04 -0.00029 0.0068 0.095 0.01 0.018 -0.0041 -0.012 0.026 -0.0058 -0.015 0.02 -0.0085 0.13 -0.007 0.041 -0.0026 0.022 0.0084 0.027 0.97
C65AWAPART 0.98 -0.04 -0.11 0.14 -0.041 0.014 -0.023 -0.028 -0.076 -0.028 -0.15 0.12 -0.02 0.055 -0.0095 0.49 0.016 -0.00042 -0.014 0.036 0.04 1.0 -0.046 -0.11 0.13 -0.039 0.012 -0.024 -0.026 -0.075 -0.027 -0.15 0.13 -0.012 0.055 -0.0085 0.53 0.012 -0.0047 -0.0059 0.04 0.042
C66AWABEDR -0.045 0.9 0.028 -0.022 0.19 -0.013 0.11 0.098 0.077 0.13 -0.029 0.02 -0.0072 -0.0087 0.19 0.082 -0.0022 -0.0076 -0.018 0.023 -0.00029 -0.046 1.0 0.029 -0.02 0.18 -0.014 0.098 0.1 0.07 0.13 -0.029 0.0082 -0.0081 -0.0089 0.19 0.052 -0.0025 -0.0081 -0.017 0.019 -0.0024
C67AWALAND -0.11 0.024 0.99 0.075 0.026 -0.0068 -0.0056 0.094 0.57 0.13 -0.01 0.0049 0.048 0.014 0.049 0.23 -0.0029 -0.0056 -0.016 -0.0052 0.0068 -0.11 0.029 1.0 0.091 0.016 -0.0049 -0.0052 0.082 0.55 0.1 -0.0066 -0.00062 0.039 0.018 0.054 0.13 -0.0033 0.0041 -0.016 0.00069 0.0029
C68APERSAUT 0.14 -0.0086 0.091 0.92 0.039 0.073 0.028 0.046 0.098 0.036 -0.17 0.073 0.0041 0.049 -1.4e-05 0.1 0.0012 0.044 -0.039 0.029 0.095 0.13 -0.02 0.091 1.0 0.039 0.065 0.03 0.045 0.085 0.038 -0.16 0.093 0.0061 0.045 0.003 0.026 -0.0086 0.046 -0.03 0.032 0.1
C69ABESAUT -0.039 0.31 0.015 0.019 0.9 0.018 0.21 0.17 0.058 0.28 -0.021 0.013 -0.0053 -0.0064 0.037 0.031 -0.0016 -0.0056 -0.0046 -0.0062 0.01 -0.039 0.18 0.016 0.039 1.0 0.014 0.17 0.18 0.044 0.38 -0.021 0.029 -0.0059 -0.0065 0.029 0.00053 -0.0018 -0.0059 -0.0059 -0.007 0.012
C70AMOTSCO 0.017 -0.012 -0.0062 0.054 0.026 0.9 -0.0069 0.0035 -0.0051 -0.01 -0.039 0.045 0.0097 -0.0064 0.00075 0.0029 -0.0035 -0.0014 -0.015 0.0046 0.018 0.012 -0.014 -0.0049 0.065 0.014 1.0 -0.0064 0.006 -0.0064 -0.0089 -0.039 0.039 0.018 -0.0052 -0.0011 -0.019 -0.0041 -0.004 -0.013 0.0092 0.022
C71AVRAAUT -0.023 0.13 -0.0051 0.006 0.18 -0.0069 0.95 0.086 0.063 0.07 -0.0094 -0.0077 -0.0023 -0.0028 0.041 0.011 -0.0007 -0.0025 0.029 -0.0027 -0.0041 -0.024 0.098 -0.0052 0.03 0.17 -0.0064 1.0 0.083 0.075 0.042 -0.0094 -0.0072 -0.0026 -0.0029 0.033 -0.0069 -0.00081 -0.0026 0.021 -0.0031 -0.0042
C72AAANHANG -0.025 0.14 0.076 0.031 0.15 0.0094 0.13 0.97 0.066 0.13 -0.0079 -0.0019 0.0065 0.0063 0.059 0.053 0.029 0.053 -0.0073 0.026 -0.012 -0.026 0.1 0.082 0.045 0.18 0.006 0.083 1.0 0.06 0.19 -0.011 0.0051 0.011 0.0089 0.047 0.033 0.058 0.026 -0.0085 0.037 -0.012
C73ATRACTOR -0.074 0.061 0.55 0.071 0.049 -0.0082 0.065 0.077 0.93 0.25 0.006 0.00065 0.059 0.026 0.082 0.2 -0.0028 -0.0018 -0.0089 -0.0072 0.026 -0.075 0.07 0.55 0.085 0.044 -0.0064 0.075 0.06 1.0 0.18 0.0059 -0.0095 0.049 0.024 0.075 0.11 -0.0032 -0.0016 -0.011 -0.0043 0.019
C74AWERKT -0.026 0.22 0.1 0.025 0.23 -0.0097 0.066 0.18 0.18 0.91 -0.003 0.014 0.13 -0.004 0.063 0.053 -0.00098 -0.0034 -0.008 -0.0038 -0.0058 -0.027 0.13 0.1 0.038 0.38 -0.0089 0.042 0.19 0.18 1.0 -0.0028 -0.0028 0.072 -0.004 0.051 0.021 -0.0011 -0.0037 -0.0075 -0.0043 -0.0059
C75ABROM -0.15 -0.029 -0.0057 -0.18 -0.024 -0.042 -0.01 -0.011 -0.001 0.0018 0.97 -0.041 -0.017 -0.0043 -0.017 -0.17 -0.0052 -0.018 -0.026 -0.02 -0.015 -0.15 -0.029 -0.0066 -0.16 -0.021 -0.039 -0.0094 -0.011 0.0059 -0.0028 1.0 -0.042 -0.019 -0.0054 -0.016 -0.18 -0.006 -0.02 -0.028 -0.023 -0.015
C76ALEVEN 0.13 0.018 0.0028 0.077 0.048 0.032 -0.0078 -0.00074 -0.01 -0.0036 -0.042 0.85 0.013 0.11 -0.0054 0.13 -0.004 0.0026 0.0079 0.027 0.02 0.13 0.0082 -0.00062 0.093 0.029 0.039 -0.0072 0.0051 -0.0095 -0.0028 -0.042 1.0 0.016 0.12 -0.0063 0.092 -0.0046 -0.0038 0.0039 0.033 0.018
C77APERSONG -0.0096 -0.0081 0.046 0.0072 -0.0066 0.028 -0.0028 0.015 0.055 0.058 -0.019 0.039 0.9 -0.0058 -0.0046 0.014 -0.0014 0.0036 -0.012 -0.0056 -0.0085 -0.012 -0.0081 0.039 0.0061 -0.0059 0.018 -0.0026 0.011 0.049 0.072 -0.019 0.016 1.0 -0.0059 -0.0044 0.014 -0.0017 0.024 -0.011 -0.0064 -0.0087
C78AGEZONG 0.057 -0.0089 0.018 0.046 -0.0074 -0.0063 -0.0031 0.002 0.023 -0.0046 -0.0057 0.13 -0.0053 0.98 -0.0051 0.059 -0.0016 -0.0056 0.014 0.015 0.13 0.055 -0.0089 0.018 0.045 -0.0065 -0.0052 -0.0029 0.0089 0.024 -0.004 -0.0054 0.12 -0.0059 1.0 -0.0049 0.039 -0.0018 -0.006 0.038 0.017 0.11
C79AWAOREG -0.0019 0.21 0.062 -0.0047 0.045 0.0031 0.052 0.046 0.079 0.074 -0.016 -0.0031 -0.0039 -0.0048 0.95 0.074 -0.0012 -0.0041 -0.0096 -0.0046 -0.007 -0.0085 0.19 0.054 0.003 0.029 -0.0011 0.033 0.047 0.075 0.051 -0.016 -0.0063 -0.0044 -0.0049 1.0 0.034 -0.0014 -0.0044 -0.009 -0.0052 -0.0071
C80ABRAND 0.52 0.061 0.13 0.018 0.0033 -0.018 -0.0042 0.034 0.11 0.028 -0.19 0.085 0.0079 0.04 0.028 0.87 0.0081 0.0048 -0.041 0.026 0.041 0.53 0.052 0.13 0.026 0.00053 -0.019 -0.0069 0.033 0.11 0.021 -0.18 0.092 0.014 0.039 0.034 1.0 0.0039 0.00018 -0.031 0.026 0.035
C81AZEILPL 0.013 -0.0025 -0.0033 -0.0075 -0.0021 -0.0044 -0.00088 0.069 -0.0035 -0.0013 -0.006 -0.0049 -0.0015 -0.0018 -0.0014 0.01 0.87 0.16 -0.0037 0.035 -0.0026 0.012 -0.0025 -0.0033 -0.0086 -0.0018 -0.0041 -0.00081 0.058 -0.0032 -0.0011 -0.006 -0.0046 -0.0017 -0.0018 -0.0014 0.0039 1.0 0.091 -0.0034 0.082 -0.0027
C82APLEZIER -0.0022 -0.0081 0.0021 0.032 -0.0067 -0.00032 -0.0028 0.032 -0.00084 -0.0042 -0.019 0.0075 0.015 -0.0059 -0.0046 0.013 0.047 0.9 -0.012 0.025 0.022 -0.0047 -0.0081 0.0041 0.046 -0.0059 -0.004 -0.0026 0.026 -0.0016 -0.0037 -0.02 -0.0038 0.024 -0.006 -0.0044 0.00018 0.091 1.0 -0.011 0.04 0.026
C83AFIETS -0.0023 -0.017 -0.017 -0.032 -0.0045 -0.011 0.014 -0.0072 -0.012 -0.0086 -0.027 -0.0055 -0.0099 0.03 -0.0094 -0.027 -0.003 -0.01 0.94 0.00043 0.0084 -0.0059 -0.017 -0.016 -0.03 -0.0059 -0.013 0.021 -0.0085 -0.011 -0.0075 -0.028 0.0039 -0.011 0.038 -0.009 -0.031 -0.0034 -0.011 1.0 0.0048 0.0092
C84AINBOED 0.045 0.022 0.0027 0.013 -0.0079 0.013 -0.0034 0.027 -0.004 -0.005 -0.023 0.028 -0.0057 0.013 -0.0055 0.037 0.042 0.064 0.01 0.88 0.027 0.04 0.019 0.00069 0.032 -0.007 0.0092 -0.0031 0.037 -0.0043 -0.0043 -0.023 0.033 -0.0064 0.017 -0.0052 0.026 0.082 0.04 0.0048 1.0 0.021
C85ABYSTAND 0.046 -0.0013 0.00016 0.085 0.022 0.025 -0.0046 -0.012 0.022 -0.0068 -0.017 0.04 -0.0078 0.12 -0.0075 0.054 -0.0023 0.013 0.0083 0.026 0.97 0.042 -0.0024 0.0029 0.1 0.012 0.022 -0.0042 -0.012 0.019 -0.0059 -0.015 0.018 -0.0087 0.11 -0.0071 0.035 -0.0027 0.026 0.0092 0.021 1.0

1.3 Value Counts of CARAVAN

It is clear that our dataset is highly unbalanced with only 6.36% of observations actually buying the insurance.

In [13]:
from __future__ import division

#y_Train_original.value_counts().plot(kind='bar', title='Classifying CARAVAN 2', color='steelblue', grid=True)
#print "y_Train: \n", y_Train_original.value_counts()
#print "\n"

#y_TestdropM2.value_counts().plot(kind='bar', title='Classifying CARAVAN', color='steelblue', grid=True)
#print "y_Test: \n",y_Test.value_counts()
#print "\n"
X = Train.drop(['C86CARAVAN'], axis=1)
y = Train['C86CARAVAN']

plt.subplot(1,1,1)
y.value_counts().plot(kind='bar', title='Classifying CARAVAN: all Data', color='steelblue', grid=True)
print "y: \n",y.value_counts()
print("Caravan Ratio: {:.2%}".format(348/5474))
#print float(format(), .02%)
y: 
0    5474
1     348
Name: C86CARAVAN, dtype: int64
Caravan Ratio: 6.36%

2. Business Understanding

Determination of target customers for Caravan insurance. This is a Cross selling problem: Cross-selling is the action or practice of selling an additional product or service to an existing customer (https://en.wikipedia.org/wiki/Cross-selling).

  • Benefit Item: Customer who would buy Caravan
  • To translate it into the data understanding, we care more about, if we could predict the "1" class correctly or not. Because the profit Item are "1" class. But as shown in 1.3, it is clear that our dataset is highly unbalanced with only 6.36% of observations actually buying the insurance

3. Preprocessing

3.1 Feature initial selection

  • TraindropM: Apart from 'Purchasing Power Class', all sociodemographic variables derived from zip codes were discarded, because they did not add predictive power to the model.
  • TraindropM2: since the "Contribution of policy" of group C44-C64 and "Number of policy" of groups are highly correleted, so I would generate a second TraindropM2 datasets, which includes one of the groups. I choose to keep the "Contribution" group, since "number" are not a measure for the real value, for example, 10 car policies for VW may has less value that 1 car policy for one Lamborghini Veneno Roadster.
In [8]:
#  Apart  from  'Purchasing  Power  Class',  all  sociodemographic  variables  derived  from  zip  codeswere  discarded,  because  they  did  not  add  predictive  power  to  the  model.except for "C1MOSTYPE", "C2MAANTHUI", "C3MGEMOMV","C4MGEMLEEF", "C5MOSHOOFD","C42MINKGEM","C43MKOOPKLA" 
TraindropM = Train[["C43MKOOPKLA", "C44PWAPART", "C45PWABEDR","C46PWALAND", "C47PPERSAUT", 
                                       "C48PBESAUT","C49PMOTSCO", "C50PVRAAUT", "C51PAANHANG","C52PTRACTOR", "C53PWERKT", 
                                       "C54PBROM","C55PLEVEN", "C56PPERSONG", "C57PGEZONG","C58PWAOREG", "C59PBRAND",
                                       "C60PZEILPL","C61PPLEZIER", "C62PFIETS", "C63PINBOED","C64PBYSTAND", "C65AWAPART", 
                                       "C66AWABEDR","C67AWALAND","C68APERSAUT", "C69ABESAUT", "C70AMOTSCO","C71AVRAAUT", 
                                       "C72AAANHANG", "C73ATRACTOR","C74AWERKT", "C75ABROM", "C76ALEVEN","C77APERSONG",
                                       "C78AGEZONG", "C79AWAOREG","C80ABRAND","C81AZEILPL", "C82APLEZIER", "C83AFIETS",
                                       "C84AINBOED", "C85ABYSTAND", "C86CARAVAN"]]

print "first 5 rows of TraindropM are:\n", TraindropM.head()
print TraindropM.describe()

TraindropM2 = Train[["C43MKOOPKLA", "C44PWAPART", "C45PWABEDR","C46PWALAND", "C47PPERSAUT", 
                                       "C48PBESAUT","C49PMOTSCO", "C50PVRAAUT", "C51PAANHANG","C52PTRACTOR", "C53PWERKT", 
                                       "C54PBROM","C55PLEVEN", "C56PPERSONG", "C57PGEZONG","C58PWAOREG", "C59PBRAND",
                                       "C60PZEILPL","C61PPLEZIER", "C62PFIETS", "C63PINBOED","C64PBYSTAND", "C86CARAVAN"]]

print "first 5 rows of TraindropM are:\n", TraindropM.head()
first 5 rows of TraindropM are:
   C43MKOOPKLA  C44PWAPART  C45PWABEDR  C46PWALAND  C47PPERSAUT  C48PBESAUT  \
0            3           0           0           0            6           0   
1            4           2           0           0            0           0   
2            4           2           0           0            6           0   
3            4           0           0           0            6           0   
4            3           0           0           0            0           0   

   C49PMOTSCO  C50PVRAAUT  C51PAANHANG  C52PTRACTOR     ...      C77APERSONG  \
0           0           0            0            0     ...                0   
1           0           0            0            0     ...                0   
2           0           0            0            0     ...                0   
3           0           0            0            0     ...                0   
4           0           0            0            0     ...                0   

   C78AGEZONG  C79AWAOREG  C80ABRAND  C81AZEILPL  C82APLEZIER  C83AFIETS  \
0           0           0          1           0            0          0   
1           0           0          1           0            0          0   
2           0           0          1           0            0          0   
3           0           0          1           0            0          0   
4           0           0          1           0            0          0   

   C84AINBOED  C85ABYSTAND  C86CARAVAN  
0           0            0           0  
1           0            0           0  
2           0            0           0  
3           0            0           0  
4           0            0           0  

[5 rows x 44 columns]
       C43MKOOPKLA   C44PWAPART   C45PWABEDR   C46PWALAND  C47PPERSAUT  \
count  5822.000000  5822.000000  5822.000000  5822.000000  5822.000000   
mean      4.236345     0.771213     0.040021     0.071625     2.970457   
std       2.007150     0.958623     0.362680     0.499980     2.920669   
min       1.000000     0.000000     0.000000     0.000000     0.000000   
25%       3.000000     0.000000     0.000000     0.000000     0.000000   
50%       4.000000     0.000000     0.000000     0.000000     5.000000   
75%       6.000000     2.000000     0.000000     0.000000     6.000000   
max       8.000000     3.000000     6.000000     4.000000     8.000000   

        C48PBESAUT   C49PMOTSCO   C50PVRAAUT  C51PAANHANG  C52PTRACTOR  \
count  5822.000000  5822.000000  5822.000000  5822.000000  5822.000000   
mean      0.048265     0.175369     0.009447     0.020955     0.092580   
std       0.531346     0.897222     0.244675     0.212738     0.603076   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       0.000000     0.000000     0.000000     0.000000     0.000000   
50%       0.000000     0.000000     0.000000     0.000000     0.000000   
75%       0.000000     0.000000     0.000000     0.000000     0.000000   
max       7.000000     7.000000     9.000000     5.000000     6.000000   

          ...       C77APERSONG   C78AGEZONG   C79AWAOREG    C80ABRAND  \
count     ...       5822.000000  5822.000000  5822.000000  5822.000000   
mean      ...          0.005325     0.006527     0.004638     0.570079   
std       ...          0.072782     0.080532     0.077403     0.562058   
min       ...          0.000000     0.000000     0.000000     0.000000   
25%       ...          0.000000     0.000000     0.000000     0.000000   
50%       ...          0.000000     0.000000     0.000000     1.000000   
75%       ...          0.000000     0.000000     0.000000     1.000000   
max       ...          1.000000     1.000000     2.000000     7.000000   

        C81AZEILPL  C82APLEZIER    C83AFIETS   C84AINBOED  C85ABYSTAND  \
count  5822.000000  5822.000000  5822.000000  5822.000000  5822.000000   
mean      0.000515     0.006012     0.031776     0.007901     0.014256   
std       0.022696     0.081632     0.210986     0.090463     0.119996   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       0.000000     0.000000     0.000000     0.000000     0.000000   
50%       0.000000     0.000000     0.000000     0.000000     0.000000   
75%       0.000000     0.000000     0.000000     0.000000     0.000000   
max       1.000000     2.000000     3.000000     2.000000     2.000000   

        C86CARAVAN  
count  5822.000000  
mean      0.059773  
std       0.237087  
min       0.000000  
25%       0.000000  
50%       0.000000  
75%       0.000000  
max       1.000000  

[8 rows x 44 columns]
first 5 rows of TraindropM are:
   C43MKOOPKLA  C44PWAPART  C45PWABEDR  C46PWALAND  C47PPERSAUT  C48PBESAUT  \
0            3           0           0           0            6           0   
1            4           2           0           0            0           0   
2            4           2           0           0            6           0   
3            4           0           0           0            6           0   
4            3           0           0           0            0           0   

   C49PMOTSCO  C50PVRAAUT  C51PAANHANG  C52PTRACTOR     ...      C77APERSONG  \
0           0           0            0            0     ...                0   
1           0           0            0            0     ...                0   
2           0           0            0            0     ...                0   
3           0           0            0            0     ...                0   
4           0           0            0            0     ...                0   

   C78AGEZONG  C79AWAOREG  C80ABRAND  C81AZEILPL  C82APLEZIER  C83AFIETS  \
0           0           0          1           0            0          0   
1           0           0          1           0            0          0   
2           0           0          1           0            0          0   
3           0           0          1           0            0          0   
4           0           0          1           0            0          0   

   C84AINBOED  C85ABYSTAND  C86CARAVAN  
0           0            0           0  
1           0            0           0  
2           0            0           0  
3           0            0           0  
4           0            0           0  

[5 rows x 44 columns]

3.2 Split Train Test for Sampling

Split before SMOTE to avoid data bleeding.

3.2.1 Split Train Test for TraindropM

In [9]:
# Feature size 
XdropM = TraindropM.drop(['C86CARAVAN'], axis=1)
#ydropM = TraindropM['C86CARAVAN']

X_Train_originaldropM,X_TestdropM,y_Train_originaldropM,y_TestdropM= train_test_split(XdropM, y, test_size=0.3,random_state=42)

print('X and y Input Data:   ', XdropM.shape, y.shape)
print('Train Set Shape:       ', X_Train_originaldropM.shape, y_Train_originaldropM.shape)
print('Test Set Shape:       ', X_TestdropM.shape, y_TestdropM.shape)

#y_Train = Train['C86_CARAVAN']
#y_Test = Test['C86_CARAVAN']
#X_Train = Train.drop(['C86_CARAVAN'], axis=1)
#X_Test = Test.drop(['C86_CARAVAN'], axis=1)
#print  "\n\n"

#print "The description of features of trian: \n \n", X_Train_originaldropM.describe()
#print  "\n\n"

#print "The description of y of trian: \n \n", y_Train_originaldropM.describe()
#print  "\n\n"
#print "The description of features of test:\n \n",  X_TestdropM.describe()
#print  "\n\n"
#print "The description of y of trian: \n \n", y_TestdropM.describe()
('X and y Input Data:   ', (5822, 43), (5822L,))
('Train Set Shape:       ', (4075, 43), (4075L,))
('Test Set Shape:       ', (1747, 43), (1747L,))

3.2.2 Split Train Test for TraindropM2

In [10]:
X_TraindropM2 = TraindropM2.drop(['C86CARAVAN'], axis=1)
y_TraindropM2 = TraindropM2['C86CARAVAN']

#scalerT = preprocessing.StandardScaler().fit(Train)
#min_max_scaler = preprocessing.MinMaxScaler()
#Train = scaler.fit_transform(Train)

#print Train

#scaler = preprocessing.StandardScaler().fit(X)
#min_max_scaler = preprocessing.MinMaxScaler()
#X = scaler.fit_transform(X)

#

#print X

X_Train_original,X_TestdropM2,y_Train_original,y_TestdropM2= train_test_split(X_TraindropM2, y_TraindropM2, test_size=0.3,random_state=42)

print('X and y Input Data:   ', X_TraindropM2.shape, y_TraindropM2.shape)
print('Test Set Shape:       ', X_Train_original.shape, y_Train_original.shape)
print('Test Set Shape:       ', X_TestdropM2.shape, y_TestdropM2.shape)


#y_Train = Train['C86_CARAVAN']
#y_Test = Test['C86_CARAVAN']
#X_Train = Train.drop(['C86_CARAVAN'], axis=1)
#X_Test = Test.drop(['C86_CARAVAN'], axis=1)
print  "\n\n"

print "The description of features of trian: \n \n", X_Train_original.describe()
print  "\n\n"

#print "The description of y of trian: \n \n", y_Train_original.describe()
print  "\n\n"
#print "The description of features of test:\n \n",  X_Test.describe()
print  "\n\n"
#print "The description of y of trian: \n \n", y_Test.describe()

#fig = plt.figure(figsize=(10,10))

# Tells the total count of different values in CARAVAN
('X and y Input Data:   ', (5822, 22), (5822L,))
('Test Set Shape:       ', (4075, 22), (4075L,))
('Test Set Shape:       ', (1747, 22), (1747L,))



The description of features of trian: 
 
       C43MKOOPKLA   C44PWAPART   C45PWABEDR   C46PWALAND  C47PPERSAUT  \
count  4075.000000  4075.000000  4075.000000  4075.000000  4075.000000   
mean      4.272147     0.783067     0.045399     0.071656     2.988712   
std       1.989120     0.961997     0.395917     0.500141     2.918357   
min       1.000000     0.000000     0.000000     0.000000     0.000000   
25%       3.000000     0.000000     0.000000     0.000000     0.000000   
50%       4.000000     0.000000     0.000000     0.000000     5.000000   
75%       6.000000     2.000000     0.000000     0.000000     6.000000   
max       8.000000     3.000000     6.000000     4.000000     8.000000   

        C48PBESAUT   C49PMOTSCO   C50PVRAAUT  C51PAANHANG  C52PTRACTOR  \
count  4075.000000  4075.000000  4075.000000  4075.000000  4075.000000   
mean      0.056687     0.163436     0.009816     0.022577     0.098405   
std       0.578604     0.866826     0.238433     0.218175     0.621087   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       0.000000     0.000000     0.000000     0.000000     0.000000   
50%       0.000000     0.000000     0.000000     0.000000     0.000000   
75%       0.000000     0.000000     0.000000     0.000000     0.000000   
max       7.000000     7.000000     6.000000     4.000000     6.000000   

          ...         C55PLEVEN  C56PPERSONG   C57PGEZONG   C58PWAOREG  \
count     ...       4075.000000  4075.000000  4075.000000  4075.000000   
mean      ...          0.205890     0.014479     0.018405     0.027975   
std       ...          0.925324     0.216039     0.211140     0.410590   
min       ...          0.000000     0.000000     0.000000     0.000000   
25%       ...          0.000000     0.000000     0.000000     0.000000   
50%       ...          0.000000     0.000000     0.000000     0.000000   
75%       ...          0.000000     0.000000     0.000000     0.000000   
max       ...          9.000000     6.000000     3.000000     7.000000   

         C59PBRAND   C60PZEILPL  C61PPLEZIER    C62PFIETS   C63PINBOED  \
count  4075.000000  4075.000000  4075.000000  4075.000000  4075.000000   
mean      1.864540     0.000982     0.021840     0.024785     0.012270   
std       1.891979     0.049534     0.300162     0.155489     0.176828   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       0.000000     0.000000     0.000000     0.000000     0.000000   
50%       2.000000     0.000000     0.000000     0.000000     0.000000   
75%       4.000000     0.000000     0.000000     0.000000     0.000000   
max       8.000000     3.000000     6.000000     1.000000     6.000000   

       C64PBYSTAND  
count  4075.000000  
mean      0.047853  
std       0.409050  
min       0.000000  
25%       0.000000  
50%       0.000000  
75%       0.000000  
max       5.000000  

[8 rows x 22 columns]









3.3 SMOTE of Train data

3.3.1 SMOTE for the Train data: TraindropM

After droping the most sociodemographic variables derived from zip codes

In [11]:
from imblearn.over_sampling import SMOTE

doOversampling = True

if doOversampling:
# Apply regular SMOTE
    sm = SMOTE(kind='regular')
    X_TraindropM, y_TraindropM = sm.fit_sample(X_Train_originaldropM, y_Train_originaldropM)
    print('Training Set Shape after oversampling:   ', X_TraindropM.shape, y_TraindropM.shape)
    print(pd.crosstab(y_TraindropM,y_TraindropM))
else:
    X_TraindropM = X_Train_originaldropM
    y_TraindropM = y_Train_originaldropM
('Training Set Shape after oversampling:   ', (7692L, 43L), (7692L,))
col_0     0     1
row_0            
0      3846     0
1         0  3846

3.3.2 SMOTE for the Train data: TraindropM

In [12]:
doOversampling2 = True

if doOversampling2:
# Apply regular SMOTE
    sm = SMOTE(kind='regular')
    X_TraindropM2, y_TraindropM2 = sm.fit_sample(X_Train_original, y_Train_original)
    print('Training Set Shape after oversampling:   ', X_TraindropM2.shape, y_TraindropM2.shape)
    print(pd.crosstab(y_TraindropM2,y_TraindropM2))
else:
    X_TraindropM2 = X_Train_original
    y_TraindropM2 = y_Train_original

##Transforms features by scaling each feature to a given range
('Training Set Shape after oversampling:   ', (7692L, 22L), (7692L,))
col_0     0     1
row_0            
0      3846     0
1         0  3846

3.4 Feature Importance

  • Factors ordered by F-statistic: the higher the F-statistic, the better prediction potential of the factor
  • The table is sorted after F value
  • used TraindropM2: uses 22 features
  • The top 4 features (F > 44) are also used in capital 6 to analyse the persona of the target group
In [13]:
from statsmodels.stats import anova
import statsmodels.api as sm    
from statsmodels.formula.api import ols

mtmodel1 = ols('C86CARAVAN ~ C43MKOOPKLA + C44PWAPART + C45PWABEDR + C46PWALAND + C47PPERSAUT +   C48PBESAUT + C49PMOTSCO + C50PVRAAUT + C51PAANHANG + C52PTRACTOR + C53PWERKT +   C54PBROM + C55PLEVEN + C56PPERSONG + C57PGEZONG + C58PWAOREG + C59PBRAND +  C60PZEILPL + C61PPLEZIER + C62PFIETS + C63PINBOED + C64PBYSTAND ', Train).fit()    
#Anova table for one or more fitted linear models.Single factor analysis, prediction power : http://www.statisticshowto.com/probability-and-statistics/f-statistic-value-test/  # http://www.statisticshowto.com/support-or-reject-null-hypothesis/
# An F statistic is a value you get when you run an ANOVA test or a regression analysis to find out if the means between two populations are significantly different. 
anovatable = sm.stats.anova_lm(mtmodel1)
anovatable_sorted = anovatable.sort_values(['df', 'sum_sq', 'mean_sq','F','PR(>F)'], ascending =  [False,False, False, True,False])
anovatable_sorted.style.set_table_styles(
   [{'selector': 'tr:hover',
       'props': [('background-color', 'yellow')]}]
 )
#print(anova.anova_lm(mt_model1))
c:\users\chenp\anaconda2\lib\site-packages\statsmodels\compat\pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
c:\users\chenp\anaconda2\lib\site-packages\statsmodels\stats\anova.py:129: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  table.ix[index, ['df', 'sum_sq']] = np.c_[arr[~idx].sum(1), sum_sq]
c:\users\chenp\anaconda2\lib\site-packages\scipy\stats\_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in greater
  return (self.a < x) & (x < self.b)
c:\users\chenp\anaconda2\lib\site-packages\scipy\stats\_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in less
  return (self.a < x) & (x < self.b)
c:\users\chenp\anaconda2\lib\site-packages\scipy\stats\_distn_infrastructure.py:1821: RuntimeWarning: invalid value encountered in less_equal
  cond2 = cond0 & (x <= self.a)
Out[13]:
df sum_sq mean_sq F PR(>F)
Residual 5799 310.152 0.0534836 nan nan
C47PPERSAUT 1 6.15912 6.15912 115.159 1.28963e-26
C43MKOOPKLA 1 3.01159 3.01159 56.3086 7.12959e-14
C44PWAPART 1 2.97152 2.97152 55.5594 1.03987e-13
C61PPLEZIER 1 2.30267 2.30267 43.0538 5.78743e-11
C59PBRAND 1 0.885499 0.885499 16.5564 4.78504e-05
C64PBYSTAND 1 0.462721 0.462721 8.65164 0.0032806
C62PFIETS 1 0.383858 0.383858 7.1771 0.00740482
C58PWAOREG 1 0.349533 0.349533 6.53532 0.0106006
C57PGEZONG 1 0.244451 0.244451 4.57058 0.0325673
C51PAANHANG 1 0.0814104 0.0814104 1.52216 0.217344
C53PWERKT 1 0.0525914 0.0525914 0.983317 0.321423
C46PWALAND 1 0.0309153 0.0309153 0.578033 0.447115
C60PZEILPL 1 0.0278204 0.0278204 0.520166 0.4708
C54PBROM 1 0.0219015 0.0219015 0.409498 0.522249
C50PVRAAUT 1 0.0208939 0.0208939 0.39066 0.531977
C48PBESAUT 1 0.0125388 0.0125388 0.234442 0.628268
C49PMOTSCO 1 0.00723325 0.00723325 0.135242 0.713071
C56PPERSONG 1 0.00637221 0.00637221 0.119143 0.729978
C52PTRACTOR 1 0.00560445 0.00560445 0.104788 0.746169
C63PINBOED 1 0.00487344 0.00487344 0.0911202 0.762769
C55PLEVEN 1 0.00405367 0.00405367 0.0757927 0.783092
C45PWABEDR 1 0.000169772 0.000169772 0.00317428 0.955072

3.5 Select top 8 features (F Value >5) from TraindropM2 based on the feature importance

TrainselecdropM

In [14]:
#Xselec = (Train[Train.columns[[47,44,61,1,16,10,82,59]]].values)
#print Xselec

#select top 8 features to avoid overfit 

TrainselecdropM = TraindropM2[["C47PPERSAUT","C44PWAPART", "C61PPLEZIER", "C43MKOOPKLA","C59PBRAND","C64PBYSTAND","C58PWAOREG","C62PFIETS"]]
                                       
         
print (TrainselecdropM.describe())

TrainselecdropM.style.set_table_styles(
   [{'selector': 'tr:hover',
       'props': [('background-color', 'yellow')]}]
 )

XselecdropM = TrainselecdropM
#yselecdropM = TraindropM['C86CARAVAN']


X_TrainselecdropM,X_TestselecdropM,y_TrainselecdropM,y_TestselecdropM= train_test_split(XselecdropM, y, test_size=0.3,random_state=42)

print('X and y Input Data:   ', XselecdropM.shape, y.shape)
print('Train Set Shape:       ', X_TrainselecdropM.shape, y_TrainselecdropM.shape)
print('Test Set Shape:       ', X_TestselecdropM.shape, y_TestselecdropM.shape)


#y_Train = Train['C86_CARAVAN']
#y_Test = Test['C86_CARAVAN']
#X_Train = Train.drop(['C86_CARAVAN'], axis=1)
#X_Test = Test.drop(['C86_CARAVAN'], axis=1)
#print  "\n\n"

#print "The description of features of trian with top 8 selected features: \n \n", X_TrainselecdropM.describe()
#print  "\n\n"

#print "The description of y of trian with top 8 selected features: \n \n", y_TrainselecdropM.describe()
#print  "\n\n"
#print "The description of features of test with top 8 selected features:\n \n",  X_TestselecdropM.describe()
#print  "\n\n"
#print "The description of y of trian with top 8 selected features: \n \n", y_TestselecdropM.describe()
       C47PPERSAUT   C44PWAPART  C61PPLEZIER  C43MKOOPKLA    C59PBRAND  \
count  5822.000000  5822.000000  5822.000000  5822.000000  5822.000000   
mean      2.970457     0.771213     0.018894     4.236345     1.827722   
std       2.920669     0.958623     0.273028     2.007150     1.879290   
min       0.000000     0.000000     0.000000     1.000000     0.000000   
25%       0.000000     0.000000     0.000000     3.000000     0.000000   
50%       5.000000     0.000000     0.000000     4.000000     2.000000   
75%       6.000000     2.000000     0.000000     6.000000     4.000000   
max       8.000000     3.000000     6.000000     8.000000     8.000000   

       C64PBYSTAND   C58PWAOREG    C62PFIETS  
count  5822.000000  5822.000000  5822.000000  
mean      0.047578     0.023531     0.025249  
std       0.409016     0.375274     0.156894  
min       0.000000     0.000000     0.000000  
25%       0.000000     0.000000     0.000000  
50%       0.000000     0.000000     0.000000  
75%       0.000000     0.000000     0.000000  
max       5.000000     7.000000     1.000000  
('X and y Input Data:   ', (5822, 8), (5822L,))
('Train Set Shape:       ', (4075, 8), (4075L,))
('Test Set Shape:       ', (1747, 8), (1747L,))

3.6 Feature correlation of the selected top 8 features

In [15]:
TrainselecdropM.corr(method='pearson').style.format("{:.2}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)
Out[15]:
C47PPERSAUT C44PWAPART C61PPLEZIER C43MKOOPKLA C59PBRAND C64PBYSTAND C58PWAOREG C62PFIETS
C47PPERSAUT 1.0 0.15 0.025 0.023 0.091 0.078 -0.0074 -0.04
C44PWAPART 0.15 1.0 0.0021 0.012 0.48 0.044 -0.0017 -0.011
C61PPLEZIER 0.025 0.0021 1.0 0.019 0.019 0.01 -0.0043 -0.011
C43MKOOPKLA 0.023 0.012 0.019 1.0 0.1 0.071 0.018 0.037
C59PBRAND 0.091 0.48 0.019 0.1 1.0 0.061 0.065 -0.038
C64PBYSTAND 0.078 0.044 0.01 0.071 0.061 1.0 -0.0073 0.008
C58PWAOREG -0.0074 -0.0017 -0.0043 0.018 0.065 -0.0073 1.0 -0.01
C62PFIETS -0.04 -0.011 -0.011 0.037 -0.038 0.008 -0.01 1.0

4. Train, test, evaluate the Models and generate output file

4.1 The 1st Pioneer Base Model to check how the Prediction Result would be like

Train, test and evaluate the Random Forest Classifier with the Train data after droping the most sociodemographic variables derived from zip codes

4.1.1 Train and Test the Random Forest classifier

We choose to use TraindropM2 to evaluate the model, since this data set are the cleanst of the three canditate data sets.

In [16]:
from sklearn.ensemble import RandomForestClassifier
clf2 = RandomForestClassifier()
clf2.fit(X_TraindropM2, y_TraindropM2)
test_predictions2 = clf2.predict(X_TestdropM2)

#define the Colorcodes 
CBLACK  = '\33[30m'
CRED    = '\33[31m'
CGREEN  = '\33[32m'
CYELLOW = '\33[33m'
CBLUE   = '\33[34m'
CVIOLET = '\33[35m'
CBEIGE  = '\33[36m'
CWHITE  = '\33[37m'

CBLACKBG  = '\33[40m'
CREDBG    = '\33[41m'
CGREENBG  = '\33[42m'
CYELLOWBG = '\33[43m'
CBLUEBG   = '\33[44m'
CVIOLETBG = '\33[45m'
CBEIGEBG  = '\33[46m'
CWHITEBG  = '\33[47m'

CGREY    = '\33[90m'
CEND = '\033[0m'

4.1.2 Evaluate the model

In [19]:
from sklearn import metrics
from sklearn.metrics import roc_curve, roc_auc_score,f1_score,accuracy_score,log_loss,confusion_matrix
from __future__ import division

print('****ResultsselecdropM****')
print('\n----------------Unhelpful Scores\n')

test_predictionsdropMR = clf2.predict(X_TestdropM2)
accdropMR = f1_score(y_TestdropM2, test_predictionsdropMR)
print("F-scoreselecdropM: {:.2%}".format(accdropMR))
    
test_predictionsdropMR = clf2.predict(X_TestdropM2)
acc2dropMR = accuracy_score(y_TestdropM2, test_predictionsdropMR)

print('Model accuracyselecdropM: {:.2%} '.format(acc2dropMR))
    
    
print('\n----------------Useful Scores: loss and cost-benefit scores\n')
    
print('ROC just used here this to check overfitting(since if focus on positive values(0 here, not caravan), but we cares more about negative values): \nIf both curves are not too far from each other indicates (Train above test) that there is little overfitting, if the roc of the train is much better that the test and both curves are far from each other, then it´s overfiting. Otherwise underfitting.')

RocScoreR=roc_auc_score(y_TestdropM2, clf2.predict(X_TestdropM2))
fprBR, tprBR, thresholdsBR = roc_curve(y_TestdropM2, clf2.predict_proba(X_TestdropM2)[:,1])
RocScoreTrainR=roc_auc_score(y_TraindropM2, clf2.predict(X_TraindropM2))
fprBTrainR, tprBTrainR, thresholdsBTrainR = roc_curve(y_TraindropM2, clf2.predict_proba(X_TraindropM2)[:,1])
    
print RocScoreR
print RocScoreTrainR
plt.figure()
plt.plot(fprBR, tprBR,  label='classifiersTest' % RocScoreR)
plt.plot(fprBTrainR, tprBTrainR, label='classifiersTrain' % RocScoreTrainR)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()       
    
test_predictionsdropMR = clf2.predict_proba(X_TestdropM2)
llselecdropMR = log_loss(y_TestdropM2, test_predictionsdropMR)
print("Log LossselecdropM: {:.2f}".format(llselecdropMR))

print"Confusion Matrix: \n", confusion_matrix(y_TestdropM2, clf2.predict(X_TestdropM2))

test_predictionsselecdropMR = clf2.predict(X_TestdropM2)
confusionselecdropMR = metrics.confusion_matrix(y_TestdropM2, test_predictionsselecdropMR)
TNselecdropMR = confusionselecdropMR[0, 0]
TPselecdropMR = confusionselecdropMR[1, 1]
FNselecdropMR = confusionselecdropMR[1, 0]
FPselecdropMR = confusionselecdropMR[0, 1]
BenefitItemselecdropMR = TPselecdropMR 
BenefitCoselecdropMR = TPselecdropMR / (TPselecdropMR + FNselecdropMR) # this is specificity in statistics 
print("BenefitItemselecdropM: {}".format(BenefitItemselecdropMR))
print("BenefitCoselecdropM: {:.2%}".format(BenefitCoselecdropMR))
    
CostItemselecdropMR = (TPselecdropMR + FPselecdropMR)
CostCoselecdropMR = (TPselecdropMR + FPselecdropMR) / (TPselecdropMR + TNselecdropMR + FPselecdropMR +FNselecdropMR) 
          
print("CostItemselecdropM: {}".format(CostItemselecdropMR))
print("CostCoselecdropM: {:.2%}".format(CostCoselecdropMR))

if CostCoselecdropMR == 0: 
    ImproveRatioselecdropMR = 0
else: 
    ImproveRatioselecdropMR = format((BenefitItemselecdropMR/CostItemselecdropMR) /((TPselecdropMR+FNselecdropMR) /(TPselecdropMR + TNselecdropMR + FPselecdropMR +FNselecdropMR)), '.2%')    
print(CBLUE+"ImproveRatioselecdropM: {}".format(ImproveRatioselecdropMR)+CEND)
****ResultsselecdropM****

----------------Unhelpful Scores

F-scoreselecdropM: 25.98%
Model accuracyselecdropM: 89.24% 

----------------Useful Scores: loss and cost-benefit scores

ROC just used here this to check overfitting(since if focus on positive values(0 here, not caravan), but we cares more about negative values): 
If both curves are not too far from each other indicates (Train above test) that there is little overfitting, if the roc of the train is much better that the test and both curves are far from each other, then it´s overfiting. Otherwise underfitting.
0.6073286808580926
0.906786271450858
Log LossselecdropM: 0.71
Confusion Matrix: 
[[1526  102]
 [  86   33]]
BenefitItemselecdropM: 33
BenefitCoselecdropM: 27.73%
CostItemselecdropM: 135
CostCoselecdropM: 7.73%
ImproveRatioselecdropM: 358.86%

4.1.3 Feature importance evaluated based on Random Forest Classifier 2 (used the picked train dataset with out the zip codes features)

In [20]:
#features = Test.drop(['CARAVAN'], axis=1)

importances2 = clf2.feature_importances_
std2 = np.std([tree.feature_importances_ for tree in clf2.estimators_],
             axis=0)
indices2 = np.argsort(importances2[0:22])[::-1]

indices = indices2[0:22]

# Print the feature ranking
print("Feature ranking:")

#features = Train.columns

for f in range(22):
    print("%d. %s (%f)" % (f + 1, (TraindropM2.columns.values[:22]).reshape(-1)[indices[f]], importances2[indices[f]]))

# Plot the feature importances of the forest
#import pylab as pl
plt.figure(figsize=(14, 3))
plt.title("Feature importances")
plt.bar(range(22), importances2[indices], yerr=std2[indices], color="steelblue", align="center")

plt.yticks(size=14,color="#201506")
plt.xticks(range(22), TraindropM2.columns.values[:22].reshape(-1)[indices], rotation='vertical',size=12,color="#201506")
plt.xlim([-1, 12])
plt.show()
Feature ranking:
1. C43MKOOPKLA (0.311930)
2. C47PPERSAUT (0.181307)
3. C59PBRAND (0.171922)
4. C44PWAPART (0.130422)
5. C62PFIETS (0.031299)
6. C54PBROM (0.030412)
7. C55PLEVEN (0.025091)
8. C61PPLEZIER (0.022949)
9. C49PMOTSCO (0.016648)
10. C64PBYSTAND (0.013073)
11. C58PWAOREG (0.011999)
12. C51PAANHANG (0.009459)
13. C45PWABEDR (0.008019)
14. C46PWALAND (0.007852)
15. C57PGEZONG (0.007003)
16. C48PBESAUT (0.006832)
17. C52PTRACTOR (0.006477)
18. C63PINBOED (0.003916)
19. C56PPERSONG (0.001769)
20. C53PWERKT (0.000639)
21. C60PZEILPL (0.000596)
22. C50PVRAAUT (0.000385)

4.2 Models

  • The hyperparameters for LGBMClassifier are selected based on the Automated Hyperparameter Optimization method published by Will Koehrsen in Github (https://github.com/WillKoehrsen).
  • The below models, if they have hyperparameters, then they are selected based on the multiple manuel experiments and experience.If they don´t have hyperparameters, then it´s just base models. There are three groups of features: 1) Holdout:
  • TraindropM (43 features)
  • TrainselecdropM (top 8 features)
  • TrainselecdropM2 (22 features) I classify the models that I apply in 5 groups:
  • #Gruppe 1: K Neighbors
  • #Gruppe 2: Boosting
  • #Gruppe 3: Trees
  • #Gruppe 4: Bayes
  • #Gruppe 5: non-linear classification SVM using RBF kernel Trick
In [21]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neural_network import *
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from IPython.display import display
from itertools import compress
from math import isnan
from sklearn import tree
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn import metrics
from sklearn.metrics import roc_curve, roc_auc_score
from __future__ import division


classifiers = [
#Gruppe 1: K Neighbors 
    #Classification is computed from a simple majority vote of the nearest neighbors of each point: 
    #a query point is assigned the data class which has the most representatives within the nearest neighbors of the point.
    KNeighborsClassifier(3),
    
#Gruppe 2: Boosting  
    # Boosting is a machine learning ensemble meta-algorithm for primarily reducing bias, and also variance in supervised learning, and a family of machine learning algorithms that convert weak learners to strong ones.
    
    # An AdaBoost classifier is a meta-estimator that begins by fitting a classifier on the original dataset and then fits additional copies of the classifier on the same dataset but where the weights of incorrectly classified instances are adjusted such that subsequent classifiers focus more on difficult cases.
    AdaBoostClassifier(),
    
    # GB builds an additive model in a forward stage-wise fashion; it allows for the optimization of arbitrary differentiable loss functions. In each stage n_classes_ regression trees are fit on the negative gradient of the binomial or multinomial deviance loss function. Binary classification is a special case where only a single regression tree is induced.
    GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42),
    
    # What: Light GBM is a gradient boosting framework that uses tree based learning algorithm.    
    # Pros: handle the large size of data and takes lower memory to run;focuses on accuracy of results.
    # Cons: Light GBM is sensitive to overfitting and can easily overfit small data. 
    lgb.LGBMClassifier(boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.6311794044268164,
        learning_rate=0.027802518491219938, max_depth=-1, metric='auc',
        min_child_samples=250, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=138, n_jobs=-1, num_leaves=40, objective='binary',
        random_state=50, reg_alpha=0.06183118355912668,
        reg_lambda=0.24742831407472365, silent=True,
        subsample=0.999742610271968, subsample_for_bin=280000,
        subsample_freq=1, verbose=1),

  #  Neural Network performs not so good and actually not as suitable as other models in this situation
  #  MLPClassifier(activation='relu', alpha=1e-05,
  #     batch_size='auto', beta_1=0.9, beta_2=0.999, early_stopping=False,
  #     epsilon=1e-08, hidden_layer_sizes=(64), learning_rate='constant',
  #     learning_rate_init=0.001, max_iter=2000, momentum=0.9,
  #     nesterovs_momentum=True, power_t=0.5, random_state=42, shuffle=True,
  #     tol=0.001, validation_fraction=0.1, verbose=True,
  #     warm_start=False),
    
#Gruppe 3: Trees 
    
    # lgb.LGBMClassifier and GradientBoostingClassifier in Gruppe 1 also uses tree methods 

    # Decision Trees (DTs) are a non-parametric supervised learning method used for classification and regression. The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features.
    # Pruning:   Remove test nodes whose leaves have less than 𝜏 instances.  Collect in new leaf node that is labeled with the majority class 
    # Pruning parameter 𝜏 is a regularization parameter that has to be tuned (e.g., by cross validation).
    DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=15, 
                                min_samples_split=2, min_samples_leaf=1, 
                                min_weight_fraction_leaf=0.0, max_features=None, 
                                max_leaf_nodes=None, min_impurity_decrease=1e-07),
   
    # A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and use averaging to improve the predictive accuracy and control over-fitting. The sub-sample size is always the same as the original input sample size but the samples are drawn with replacement if bootstrap=True (default).
    # Random Forests are an improvement over bagged decision trees.
    # In statistics, bootstrapping is any test or metric that relies on random sampling with replacement. #Bagging: Bootstrap aggregating, also called bagging, is a machine learning ensemble meta-algorithm designed to improve the stability and accuracy of machine learning algorithms used in statistical classification and regression. It also reduces variance and helps to avoid overfitting. Although it is usually applied to decision tree methods, it can be used with any type of method. 
    RandomForestClassifier(n_estimators=500, criterion='gini', max_depth=15,
                                min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                                max_features='auto', max_leaf_nodes=None, min_impurity_decrease=1e-07, 
                                bootstrap=True, oob_score=False, n_jobs=1, 
                                random_state=42, verbose=1, warm_start=False, class_weight='balanced_subsample'),
    # RF vs ET: Both methods are about the same, with the ET being a bit worse when there is a high number of noisy features (in high dimensional data-sets).That said, provided the (perhaps manual) feature selection is near optimal, the performance is about the same, however, ET's can be computationally faster.
    
    # This class implements a meta estimator that fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and use averaging to improve the predictive accuracy and control over-fitting.
    # The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y))
    # The “balanced_subsample” mode is the same as “balanced” except that weights are computed based on the bootstrap sample for every tree grown.
    ExtraTreesClassifier(n_estimators=500, criterion='gini', max_depth=15,
                                min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                                max_features='auto', max_leaf_nodes=None, min_impurity_decrease=1e-07, 
                                bootstrap=True, oob_score=False, n_jobs=1, 
                                random_state=42, verbose=1, warm_start=False, class_weight='balanced_subsample'),

#Gruppe 4: Bayes   
    
    # Bayes‘ equation: 𝑃(𝜃|𝐗,𝐲) =(𝑃(𝐲|𝐗,𝜃)/𝑃(𝜃))/ 𝑃(𝐲|𝐗)       
    # Classification: a posteriori (“posterior”) probability that θ is the correct parameter given observations 𝐲|𝐗.
    # 𝑃(𝜃): A priori (“prior”) probability of nature choosing θ: System parameter 𝜃∗ (randomly)
    # 𝑃(𝐲|𝐗,𝜃): Likelihood of observing 𝐲|𝐗 when model parameter is 𝜃.
    # Probability of observing 𝐲|𝐗; independent of 𝜃.
    # Maximum-likelihood(ML):𝜃ML = argmaxbelow𝜃 (𝑃(𝐲|𝐗,𝜃)) = argminbelow𝛉 ∑𝑖=1 𝑛 (log(1 + ehoch(−𝑦𝑖𝐱𝑖T𝛉))) <ML:Logistic Regression,using (stochastic) gradient descent>
    # Maximum-a-positeriori(MAP): 𝜃MAP = argmaxbelow𝜃 (𝑃(𝜃|𝐲,𝐗)) = argminbelow𝛉 ∑𝑖=1 𝑛 (log(1 + e(hoch−𝑦𝑖𝐱𝑖T𝛉)) + (1/2𝜎((low𝑝)(hoch2))𝛉T𝛉  A posteriori (“posterior”) distribution: a posteriori (“posterior”) probability that θ is the correct parameter given observations 𝐲|𝐗.
    # Bayes’ theorem Classification: Predictive distribution given the data 𝑃 (𝑦|𝐱∗,𝐲,𝐗) = ∫(𝑃(𝑦|𝛉,𝐱∗)𝑃(𝛉|𝐲,𝐗)d𝛉= ∫(1/(1 + e−𝑦𝐱∗T))𝛉𝑁(𝛉|𝟎,𝜎(hoch2)𝐈)𝑑𝛉. No closed-form solution for logistic regression.  Possible to approximate by sampling from the posterior.  Standard approximation: use only MAP model instead of integrating over model space.

    # Bayes’ theorem describes the probability of an event, based on prior knowledge of conditions that might be related to the event. 
    
    # P(A\B)= (P(B\A)\P(A))\(P(B))where A and B are events
    # P(A\B) is a conditional probability: the likelihood of event A occurring given that B is true.
    # {P(B\ A)}  is also a conditional probability: the likelihood of event B occurring given that {\displaystyle A} A is true.
    # P(A) and P(B) are the probabilities of observing  A and B independently of each other; this is known as the marginal probability.
    
    #Gaussian Naive Bayes
    #GaussianNB(),
    #Gaussian: It is used in classification and it assumes that features follow a normal distribution.
    #Multinomial: It is used for discrete counts. For example, let’s say,  we have a text classification problem. Here we can consider bernoulli trials which is one step further and instead of “word occurring in the document”, we have “count how often word occurs in the document”, you can think of it as “number of times outcome number x_i is observed over the n trials”.
    
    # Linear Bayes classification ##http://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html
    #A classifier with a linear decision boundary, generated by fitting class conditional densities to the data and using Bayes’ rule.
    #The model fits a Gaussian density to each class, assuming that all classes share the same covariance matrix.
    #The fitted model can also be used to reduce the dimensionality of the input by projecting it to the most discriminative directions.
    # PDF(x|k)=ehoch(−d/2) / ((2π)hoch(p/2)) /  square root(|S|) see https://stats.stackexchange.com/questions/31366/linear-discriminant-analysis-and-bayes-rule-classification/31384#31384 The relationship of LDA and regression is here: https://stats.stackexchange.com/questions/31459/what-is-the-relationship-between-regression-and-linear-discriminant-analysis-ld 
    LinearDiscriminantAnalysis(),
    
    # Quadratic Discriminant Analysis
    # QuadraticDiscriminantAnalysis(),
    
    #Bernoulli: The binomial model is useful if your feature vectors are binary. One application would be text classification with ‘bag of words’ model where the 1s & 0s are “word occurs in the document” and “word does not occur in the document” respectively.
    #Naive Bayes is a simple technique for constructing classifiers: models that assign class labels to problem instances, represented as vectors of feature values, where the class labels are drawn from some finite set. #There is not a single algorithm for training such classifiers, but a family of algorithms based on a common principle: all naive Bayes classifiers assume that the value of a particular feature is independent of the value of any other feature, given the class variable.
    #Pros:It is easy and fast to predict class of test data set. It also perform well in multi class predictionWhen assumption of independence holds, a Naive Bayes classifier performs better compare to other models like logistic regression and you need less training data.It perform well in case of categorical input variables compared to numerical variable(s). For numerical variable, normal distribution is assumed (bell curve, which is a strong assumption).
    #Cons:If categorical variable has a category (in test data set), which was not observed in training data set, then model will assign a 0 (zero) probability and will be unable to make a prediction. This is often known as “Zero Frequency”. To solve this, we can use the smoothing technique. One of the simplest smoothing techniques is called Laplace estimation.On the other side naive Bayes is also known as a bad estimator, so the probability outputs from predict_proba are not to be taken too seriously.Another limitation of Naive Bayes is the assumption of independent predictors. In real life, it is almost impossible that we get a set of predictors which are completely independent.
    BernoulliNB(alpha=1.0,fit_prior = True),
    
    
    #Logictic Regression uses Log_Loss,but SVM Hinge Loss; Moreover Log_loss is not suitble for Naive Bayes 
    #Logistic Regression and SVM can both be High dimensional features with kernels

    #Empirical risk minimization  Gradient descent method  Inexact line search  Stochastic gradient descent methods
    #The performance of SGDClassifier is not so good. Moreover, since this SGDClassifier normally uses Hinge loss, so I cannot calculate the ROC, which I use in the evaluation for all the classifiers, unless I set the loss to 'log'. so I commented this classifier. 
    #SGDClassifier(loss = 'log',penalty = 'elasticnet'),
    
    ####Linear classification Bayes####: Cost sensitive: Called multi-class “logistic regression” even though it is a classification model, linear, belongs to Bayes methods
    #For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ handle multinomial loss; ‘liblinear’ is limited to one-versus-rest schemes.
    #‘liblinear’ and ‘saga’ handle L1 penalty: square loss function is both convex and smooth and matches the 0–1 indicator function when {yf({\vec {x}})=0} yf({\vec  {x}})=0 and when {yf({\vec {x}})=1} yf({\vec  {x}})=1. However, the square loss function tends to penalize outliers excessively, leading to slower convergence rates (with regards to sample complexity) than for the logistic loss or hinge loss(L2) functions.https://www.csie.ntu.edu.tw/~cjlin/liblinear/ 
    # Softmax Function	Sigmoid Function: Logistic Regression generalize sigmoid function to softmax function 𝑃(𝑦|𝐱,𝛉) =ehoch𝐱T𝛉𝑦 / ∑low𝑦′ehoch𝐱T𝛉low𝑦′for 𝛉 

    #  For binary classification, 𝑦 ∈ {−1,+1} #Decision boundary is a hyperplane in input space.
    # 𝑃(𝑦=+-1|𝐱,𝛉) = 𝜎 (𝑦𝐱T𝛉) = 1 / (1+(ehoch(−𝑦𝐱T𝛉))) 
    LogisticRegression(solver='liblinear', max_iter=1000, 
                             random_state=42,verbose=2,class_weight='balanced'),    # class_weight='balanced' # penalize
    
    #LogisticRegression(solver='saga', max_iter=1000, random_state=42,verbose=2),

# Gruppe 5: non-linear classification SVM using RBF kernel Trick 
    
    # Defi: Given a set of training examples, each marked as belonging to one or the other of two categories, an SVM training algorithm builds a model that assigns new examples to one category or the other, making it a non-probabilistic binary linear classifier (although methods such as Platt scaling exist to use SVM in a probabilistic classification setting). 
    # Defi: An SVM model is a representation of the examples as points in space, mapped so that the examples of the separate categories are divided by a clear gap that is as wide as possible. 
    # SVM: SVM classifier with Gaussian kernel: RBF, Dual classifier, uses kernal trick:Gaussian kernel: RBF, uses squared Euclidean distance 
    
    # class_weight='balanced' # penalize
    # Kernel functions can be understood as a measure of similarity between instances. 
    # Primal view on data: “what does 𝐱 look like?” 
    # Dual view on data: “how similar is 𝐱 to each training instance?” 
    # Primal view: 𝑓𝛉𝐱 = 𝛉T𝜙𝐱  Model 𝛉 has as many parameters as the dimensionality of 𝜙 𝐱 .  Good if there are many examples with few attributes. 
    # Dual view: 𝑓𝛂𝐱 = 𝛂T𝚽𝜙𝐱  Model 𝛂 has as many parameters as there are examples.   Good if there are few examples with many attributes.  The representation 𝜙 𝐱 can even be infinite dimensional, as long as the inner product can be computed efficiently.
    # Kernel Ridge Regression # Squared loss: ℓ𝟐 𝑓𝛉 𝐱𝑖 ,𝑦𝑖 = 𝑓𝛉𝐱𝑖 −𝑦𝑖𝟐  L2 regularization: Ω2 𝛉 = ||𝛉||22 ## Minimize  𝑳 𝛉 = 𝛉T𝜙 𝐱 −𝑦𝑖 2 +𝜆𝛉T𝛉
    # Optimization criterion of the dual SVM: max 𝛃𝛽𝑖 − 𝑛𝑖=11 2 𝛽𝑖𝛽𝑗𝑦𝑖𝑦𝑗𝑘 𝐱𝑖,𝐱𝑗 𝑛 𝑖,𝑗=1 Optimization over parameters 𝛃.  Solution found with QP-Solver in 𝑂 𝑛2 .  Sparse solution.  Samples only appear as pairwise inner products.
    # Primal SVM:  Solution is a Vector 𝛉 in the space of the attributes.  Dual SVM:  The same solution is represented as weights 𝛽𝑖 of the samples. 
    # Kernel matrices are symmetric: 𝐊 = 𝐊T  Kernel matrices 𝐊 ∈ ℝ𝑛×𝑛 are positive semidefinite: ∃𝚽 ∈ ℝ𝑛×𝑚:𝐊 = 𝚽𝚽T  Kernel function 𝑘 𝐱,𝐱′ is positive semidefinite if 𝐊 is positive semidefinite for every data set.  For every positive definite function 𝑘 there is at least one mapping 𝜙 𝐱 such that 𝑘 𝐱,𝐱′ = 𝜙 𝐱 T𝜙 𝐱′ for all 𝐱 and 𝐱′.
    # Polynomial kernels: 𝑘𝑝𝑜𝑙𝑦 𝐱𝑖,𝐱𝑗 = 𝐱𝑖 T𝐱𝑗 +1 𝑝  Radial basis functions: 𝑘𝑅𝐵𝐹 𝐱𝑖,𝐱𝑗 = 𝑒−𝛾 𝐱𝑖−𝐱𝑗 2  Sigmoid kernels,   Dynamic time-warping kernels,  String kernels,  Graph kernels,
    # Kernel function 𝑘 𝐱,𝐱′ = 𝜙 𝐱 T𝜙 𝐱′ computes the inner product of the feature mapping of  instances.  The kernel function can often be computed without an explicit representation 𝜙 𝐱 .  E.g., polynomial kernel:  𝑘𝑝𝑜𝑙𝑦 𝐱𝑖,𝐱𝑗 = 𝐱𝑖 T𝐱𝑗 +1 𝑝  Infinite-dimensional feature mappings are possible  Eg., RBF kernel:  𝑘𝑅𝐵𝐹 𝐱𝑖,𝐱𝑗 = 𝑒−𝛾 𝐱𝑖−𝐱𝑗 2 Kernel functions for time series, strings, graphs, …  For a given kernel matrix, the Mercer map provides a feature mapping. Useful if a learning problem is given as a kernel function but learning should take place in the primal.  For example if the kernel matrix will be too large (quadratic memory consumption!
    # Representer Theorem:  𝑓𝛉∗ 𝐱 = 𝛼𝑖 ∗𝜙 𝐱𝑖 T𝜙 𝐱𝑛 𝑖=1   Instances only interact through inner products  Great for few instances, many attributes  Kernel learning algorithms:  Kernel ridge regression  Kernel perceptron, SVM
    # Kernel: 𝑘𝑅𝐵𝐹 𝐱𝑖,𝐱𝑗 = exp hoch(−𝛾𝐱𝑖 −𝐱𝑗hoch2)  No finite-dimensional feature mapping 𝜙.
    # Empirical risk minimization for a classification problem with a 0-1 loss function is known to be an NP-hard problem even for such a relatively simple class of functions as linear classifiers.[2] Though, it can be solved efficiently when the minimal empirical risk is zero, i.e. data is linearly separable.
    # Empirical risk minimization  Gradient descent method  Inexact line search  Stochastic gradient descent methods
    # In practice, machine learning algorithms cope with that either by employing a convex approximation to the 0-1 loss function (like hinge loss for SVM), which is easier to optimize, or by imposing assumptions on the distribution {\displaystyle P(x,y)} P(x,y) (and thus stop being agnostic learning algorithms to which the above result applies).
    # Support vector machines  Gradient or stochastic gradient, hinge loss, L2regularizer.  Maximizes margin between instances and plane.
    SVC(C=10, class_weight='balanced', gamma='auto', kernel='rbf',
              max_iter=-1, probability=True, random_state=42, verbose=True)]   # Linear: Etra Trees: This class implements a meta estimator that fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and use averaging to improve the predictive accuracy and control over-fitting.


#from costcla.sampling import cost_sampling
#from costcla.metrics import savings_score
#from costcla import models
#data = TraindropM.ix[:, 1:-5]
#sets = train_test_split(X_TraindropM, y, cost_mat =[[],[],[],[]], test_size=0.3,random_state=42) 
#X_trainC, X_testC, y_trainC, y_testC, cost_mat_trainC, cost_mat_testC = sets
#y_pred_test_lr = LogisticRegression(random_state=0).fit(X_trainC, y_trainC).predict(X_testC)
#f = CostSensitiveLogisticRegression()
#f.fit(X_trainC, y_trainC, cost_mat_trainC)
#y_pred_test_cslr = f.predict(X_testC)

4.3 Train, test and evaluate the models

4.3.1 Train, test and evaluate the models using train_test_split

4.3.1.1 Train, test and evaluate the models with the datasets: Smote X_TraindropM; y_TraindropM

In [22]:
# define the confusion matrix 
import csv
from sklearn.metrics import brier_score_loss
#lldropM = log_loss(y_TestdropM, test_predictionsdropM
#def logloss(true_label, lldropM):
    #if true_label == 1:
       # return -log(lldropM)
  #  else:
       # return -log(1 - lldropM)

def draw_confusion_matricesdropM(confusion_matriciesdropM,class_namesdropM):
    class_namesdropM = class_namesdropM.tolist()
    for cm in confusion_matricesdropM:
        classifier, cm = cm[0], cm[1]
        print'Confusion matrixdropM:\n', cm              
        fig = plt.figure()
        ax = fig.add_subplot(111)
        sns.heatmap(cm, annot=True, ax = ax,cmap='Blues', fmt='g'); #annot=True to annotate cells 
        plt.ylabel('True')
        plt.xlabel('Predicted')    
        ax.xaxis.set_ticklabels(['Not Caravan', 'Caraven'],horizontalalignment="center"); ax.yaxis.set_ticklabels(['Not Caravan', 'Caraven'],rotation=45);
        plt.show()   

class_namesdropM = np.unique(np.array(y_TestdropM))

# Logging for Visual Comparison
log_colsUdropM = ["Classifier", "F-score","Accuracy"]
logUdropM = pd.DataFrame(columns=log_colsUdropM)

log_colsdropM=["Classifier", "Log Loss","OverfittingRoc","BLoss","BI","BO", "CI", "CO", "IR","BTOR","PR"] 
logdropM = pd.DataFrame(columns=log_colsdropM)

log_colsCBAdropM =["Classifier", "MTAB","MBPTAB","MTAP","MBPTAP"]
logCBAdropM = pd.DataFrame(columns=log_colsCBAdropM)


for clf in classifiers:
    clf.fit(X_TraindropM, y_TraindropM)
    namedropM = clf.__class__.__name__
    
    print("="*110)
    print(namedropM)
    print(str(clf));print('\n')
    
    print('****************ResultsdropM****************')
    print('\n----------------Unhelpful Scores\n')
   
    # The F1 score can be interpreted as a weighted average of the precision and recall, where an F1 score reaches its best value at 1 and worst score at 0. The relative contribution of precision and recall to the F1 score are equal. 
    # F1 = 2 * (precision * recall) / (precision + recall)#precision: tp / (tp + fp) # Recall: tp / (tp + fn) 
    # We do not care about fp: not caravan predicted to be caraven, but f1 uses precision and precision uses fp, so this rate is not so suitble in this case
    test_predictionsdropM = clf.predict(X_TestdropM)
    accdropM = metrics.f1_score(y_TestdropM, test_predictionsdropM)
    print("F-scoredropM: {:.2%}".format(accdropM))
    
    # In multilabel classification, this function computes subset accuracy: the set of labels predicted for a sample must exactly match the corresponding set of labels in y_true.
    # In multilabel classification, the function returns the subset accuracy. If the entire set of predicted labels for a sample strictly match with the true set of labels, then the subset accuracy is 1.0; otherwise it is 0.0.
    # We do not care so much, if we can predict "0" right or not, and the data is very imbalanced, since it has lots "0"
    test_predictionsdropM = clf.predict(X_TestdropM)
    acc2dropM = accuracy_score(y_TestdropM, test_predictionsdropM)

    print('Model accuracydropM: {:.2%} '.format(acc2dropM))
    
    # Example of Receiver Operating Characteristic (ROC) metric to evaluate classifier output quality.
    # ROC curves typically feature true positive rate on the Y axis, and false positive rate on the X axis. This means that the top left corner of the plot is the “ideal” point - a false positive rate of zero, and a true positive rate of one. This is not very realistic, but it does mean that a larger area under the curve (AUC) is usually better.
    # The “steepness” of ROC curves is also important, since it is ideal to maximize the true positive rate while minimizing the false positive rate.
    print('ROC just use this to check overfitting:\n ')
    #If both curves are not too far from each other indicates (Train above test) that there is little overfitting, if the roc of the train is much better that the test and both curves are far from each other, then it´s overfiting. Otherwise underfitting
    test_predictionsdropM = clf.predict(X_TestdropM)
    RocScore=roc_auc_score(y_TestdropM, test_predictionsdropM)
    fprB, tprB, thresholdsB = roc_curve(y_TestdropM, clf.predict_proba(X_TestdropM)[:,1])
    RocScoreTrain=roc_auc_score(y_TraindropM, clf.predict(X_TraindropM))
    fprBTrain, tprBTrain, thresholdsBTrain = roc_curve(y_TraindropM, clf.predict_proba(X_TraindropM)[:,1])
    
    OverfittingRoc=float(format(RocScore-RocScoreTrain,'.2f'))

    print RocScore
    print RocScoreTrain
    plt.figure()
    plt.plot(fprB, tprB, label='classifiersTest' % RocScore)
    plt.plot(fprBTrain, tprBTrain, label='classifiersTrain' % RocScoreTrain)
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('Log_ROC')
    plt.show()
    
    print('\n----------------Useful Scores: loss and cost-benefit scores\n')
    # Trees and Logictic Regression uses Log_Loss,but SVM Hinge Loss; Moreover Log_loss is not suitble for Naive Bayes 
    # Log loss, aka logistic loss or cross-entropy loss.
    # Log loss, aka logistic loss or cross-entropy loss.
    # used in (multinomial) logistic regression and extensions of it such as neural networks, defined as the negative log-likelihood of the true labels given a probabilistic classifier’s predictions. The log loss is only defined for two or more labels. For a single sample with true label yt in {0,1} and estimated probability yp that yt = 1, the log loss is
    #-log P(yt|yp) = -(yt log(yp) + (1 - yt) log(1 - yp))
    # Log loss penalizes both types of errors, but especially those predications that are confident and wrong! 
    # This is not good, since the wrongly predicted caravan (FP) at the time point of the extraction of the data may become TP!
    test_predictionsdropM = clf.predict_proba(X_TestdropM)
    lldropM = log_loss(y_TestdropM, test_predictionsdropM)
    print("Log LossdropM: {:.2f}".format(lldropM))
    
    test_predictionsdropM = clf.predict(X_TestdropM)
    
    # This function returns a score of the mean square difference between the actual outcome and the predicted probability of the possible outcome. The actual outcome has to be 1 or 0 (true or false), while the predicted probability of the actual outcome can be a value between 0 and 1.The brier score loss is also between 0 to 1 and the lower the score (the mean square difference is smaller), the more accurate the prediction is. It can be thought of as a measure of the “calibration” of a set of probabilistic predictions.
    # Loss Functions for Classification: Zero-one loss, Logistic loss,Perceptron loss, Hinge Loss: not all the models use log_loss 
    BLoss= brier_score_loss(y_TestdropM, test_predictionsdropM)
    print("Brier score loss: {:.2f}".format(BLoss))
    confusiondropM = metrics.confusion_matrix(y_TestdropM, test_predictionsdropM)
    TNdropM = confusiondropM[0, 0]
    TPdropM = confusiondropM[1, 1]
    FNdropM = confusiondropM[1, 0]
    FPdropM = confusiondropM[0, 1]
    BenefitItemdropM = TPdropM 
    BenefitCodropM = TPdropM / (TPdropM + FNdropM) # this is specificity in statistics 
    print("BenefitItemdropM: {}".format(BenefitItemdropM))
    print("BenefitCodropM: {:.2%}".format(BenefitCodropM))

    
    CostItemdropM = (TPdropM + FPdropM)
    CostCodropM = (TPdropM + FPdropM) / (TPdropM + TNdropM + FPdropM +FNdropM) 
          
    print("CostItemdropM: {}".format(CostItemdropM))
    print("CostCodropM: {:.2%}".format(CostCodropM))
    if CostCodropM == 0: 
        ImproveRatiodropM = 0
    else: 
        ImproveRatiodropM = (BenefitItemdropM/CostItemdropM) /((TPdropM+FNdropM) /(TPdropM + TNdropM + FPdropM+FNdropM))       
    print(CBLUE+"ImproveRatiodropM: {:.2%}".format(ImproveRatiodropM)+CEND)
    
    #scenario BenefitItem*price-CostItem*
    if BenefitItemdropM == 0:
        balancetradeoffradiodropM = 0
    else:
        balancetradeoffradiodropM = float(format(CostItemdropM/BenefitItemdropM , '.2f'))
    if CostCodropM == 0: 
        ProfitratiodropM=0
    else:
        ProfitratiodropM = float(format(BenefitItemdropM/CostItemdropM, '.2f'))
    print(CBLUE+"balancetradeoffradio: {0:.2f}".format(balancetradeoffradiodropM)+CEND)
    print(CBLUE+"ProfitratiodropM: {0:.2f}".format(ProfitratiodropM)+CEND)
 
    
    print('\n----------------Scenario analysis shreshold: marketing and controlling strategies\n')
    #   #Scenario: give a budget and the revenue, then I can predict, how much you can spend on each costItem or how many costItem can be spent 
    Budget = 200000
    #One scenario considers benefitgoal, one scenario considers the balance, no deficit
    ProfitGoal= 20000
    # Proft maybe 700 every year, the profit of every Customer is the Reveneue from each customer minus the cost of the customer management 
    ProfitPerBenefitItem = 700

    print ("Business Application of the Model: \n\nLet´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. \nIn summary, the given KPIs of the insurance company are: " +CBEIGE+ "\n\nBudget = 200000 \nRevenuePerBenefitItem = 700 \nProfitGoal = 20000\n"+CEND+'\nThen to keep the account balance not to be deficit, then the insurance company should be:\n\n')
# within this budget, the smallest amount of  target audience should be reached, in order to keep the balance of the account 
# Min CostItem
    if balancetradeoffradiodropM == 0:
        MinTargetAudienceBdropM = 0
    else:
        MinTargetAudienceBdropM= int((Budget/ProfitPerBenefitItem)*balancetradeoffradiodropM)
    
    #For every Target Audience, how much the company can spend for max. so that the company can keep the balance and avoid deficit
    if MinTargetAudienceBdropM == 0:
        MaxBudgetPerTargetAudienceBdropM = 0
    else:
        MaxBudgetPerTargetAudienceBdropM = float(format(Budget/MinTargetAudienceBdropM, '.2f'))
    
    #To reach the benifit goal, at least MinTargetAudienceP should be reached 
    if balancetradeoffradiodropM == 0:
        MinTargetAudiencePdropM = 0
    else:
        MinTargetAudiencePdropM = int(((Budget + ProfitGoal)/ProfitPerBenefitItem)*balancetradeoffradiodropM)
    
    #To reach the benifit goal, at most MaxBudgetPerTargetAudienceP can be spent 
    if MinTargetAudienceBdropM == 0:
        MaxBudgetPerTargetAudiencePdropM = 0
    else:
        MaxBudgetPerTargetAudiencePdropM = float(format(Budget/MinTargetAudiencePdropM, '.2f'))
            
    print("MinTargetAudienceBdropM: {}".format(MinTargetAudienceBdropM))
    print("MaxBudgetPerTargetAudienceBdropM: {0:.2f}".format(MaxBudgetPerTargetAudienceBdropM)) 
    print('\nIn order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:')
    print("MinTargetAudiencePdropM: {}".format(MinTargetAudiencePdropM))
    print("MaxBudgetPerTargetAudiencePdropM: {0:.2f}".format(MaxBudgetPerTargetAudiencePdropM)+"\n")

    log_entryUdropM = pd.DataFrame([[namedropM, accdropM,acc2dropM]], columns=log_colsUdropM)#FPR*100,
    logUdropM = logUdropM.append(log_entryUdropM,ignore_index=True)
    
    log_entrydropM = pd.DataFrame([[namedropM,lldropM,OverfittingRoc,BLoss,BenefitItemdropM,BenefitCodropM, CostItemdropM, CostCodropM,ImproveRatiodropM,balancetradeoffradiodropM,ProfitratiodropM]], columns=log_colsdropM)#FPR*100,
    logdropM = logdropM.append(log_entrydropM,ignore_index=True)
        
    
    log_entryCBAdropM = pd.DataFrame([[namedropM,MinTargetAudienceBdropM,MaxBudgetPerTargetAudienceBdropM,MinTargetAudiencePdropM,MaxBudgetPerTargetAudiencePdropM]], columns=log_colsCBAdropM)
    logCBAdropM = logCBAdropM.append(log_entryCBAdropM,ignore_index=True)
   
    reportdropM = classification_report(y_TestdropM, test_predictionsdropM)
    print(reportdropM)
    confusion_matricesdropM = [
    ( "", confusion_matrix(y_TestdropM, test_predictionsdropM))
    ]
    draw_confusion_matricesdropM(confusion_matricesdropM,class_namesdropM)
    
    predictions = clf.predict(OutputdropM)
    print predictions
    #new_column = df['Classifiers'] 
#    pd.DataFrame(predictions).to_csv('C:\Users\chenp\Desktop\output.csv', index = False,header = False, sep=',', mode='a',encoding ='utf-8')
    with open('C:\Users\chenp\Desktop\output.4.3.1.1.csv', 'a') as csvfile:#, newline=''
                fwriter = csv.writer(csvfile, delimiter=',',quotechar='/',quoting=csv.QUOTE_MINIMAL)#, 
                fwriter.writerow(predictions)
    #numpy.savetxt('C:/localpath/test.csv',prediction, ,delimiter=',')
    #pd.read_csv(r'C:\Users\chenp\Desktop\00_CS_Master_Kurse_SS2018\ML1_IDA\P6_V\caravan.output.csv', sep='\t',
==============================================================================================================
KNeighborsClassifier
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 18.75%
Model accuracydropM: 89.58% 
ROC just use this to check overfitting:
 
0.5624367683191213
0.8529641185647425
----------------Useful Scores: loss and cost-benefit scores

Log LossdropM: 2.00
Brier score loss: 0.10
BenefitItemdropM: 21
BenefitCodropM: 17.65%
CostItemdropM: 105
CostCodropM: 6.01%
ImproveRatiodropM: 293.61%
balancetradeoffradio: 5.00
ProfitratiodropM: 0.20

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropM: 1428
MaxBudgetPerTargetAudienceBdropM: 140.06

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropM: 1571
MaxBudgetPerTargetAudiencePdropM: 127.31

             precision    recall  f1-score   support

          0       0.94      0.95      0.94      1628
          1       0.20      0.18      0.19       119

avg / total       0.89      0.90      0.89      1747

Confusion matrixdropM:
[[1544   84]
 [  98   21]]
[0 0 1 ... 0 0 0]
==============================================================================================================
AdaBoostClassifier
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 24.65%
Model accuracydropM: 78.31% 
ROC just use this to check overfitting:
 
0.6616098527863233
0.7909516380655226
----------------Useful Scores: loss and cost-benefit scores

Log LossdropM: 0.68
Brier score loss: 0.22
BenefitItemdropM: 62
BenefitCodropM: 52.10%
CostItemdropM: 384
CostCodropM: 21.98%
ImproveRatiodropM: 237.03%
balancetradeoffradio: 6.19
ProfitratiodropM: 0.16

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropM: 1768
MaxBudgetPerTargetAudienceBdropM: 113.12

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropM: 1945
MaxBudgetPerTargetAudiencePdropM: 102.83

             precision    recall  f1-score   support

          0       0.96      0.80      0.87      1628
          1       0.16      0.52      0.25       119

avg / total       0.90      0.78      0.83      1747

Confusion matrixdropM:
[[1306  322]
 [  57   62]]
[1 1 1 ... 0 0 0]
==============================================================================================================
GradientBoostingClassifier
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=1.0, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 19.84%
Model accuracydropM: 88.44% 
ROC just use this to check overfitting:
 
0.5718724836371895
0.9082163286531462
----------------Useful Scores: loss and cost-benefit scores

Log LossdropM: 0.35
Brier score loss: 0.12
BenefitItemdropM: 25
BenefitCodropM: 21.01%
CostItemdropM: 133
CostCodropM: 7.61%
ImproveRatiodropM: 275.95%
balancetradeoffradio: 5.32
ProfitratiodropM: 0.19

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropM: 1520
MaxBudgetPerTargetAudienceBdropM: 131.58

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropM: 1672
MaxBudgetPerTargetAudiencePdropM: 119.62

             precision    recall  f1-score   support

          0       0.94      0.93      0.94      1628
          1       0.19      0.21      0.20       119

avg / total       0.89      0.88      0.89      1747

Confusion matrixdropM:
[[1520  108]
 [  94   25]]
[0 0 0 ... 0 0 0]
==============================================================================================================
LGBMClassifier
LGBMClassifier(boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.631179404427, importance_type='split',
        learning_rate=0.0278025184912, max_depth=-1, metric='auc',
        min_child_samples=250, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=138, n_jobs=-1, num_leaves=40, objective='binary',
        random_state=50, reg_alpha=0.0618311835591,
        reg_lambda=0.247428314075, silent=True, subsample=0.999742610272,
        subsample_for_bin=280000, subsample_freq=1, verbose=1)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 23.11%
Model accuracydropM: 80.19% 
ROC just use this to check overfitting:
 
0.6327994342700225
0.8352834113364535
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
----------------Useful Scores: loss and cost-benefit scores

Log LossdropM: 0.43
Brier score loss: 0.20
BenefitItemdropM: 52
BenefitCodropM: 43.70%
CostItemdropM: 331
CostCodropM: 18.95%
ImproveRatiodropM: 230.63%
balancetradeoffradio: 6.37
ProfitratiodropM: 0.16

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropM: 1820
MaxBudgetPerTargetAudienceBdropM: 109.89

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropM: 2002
MaxBudgetPerTargetAudiencePdropM: 99.90

             precision    recall  f1-score   support

          0       0.95      0.83      0.89      1628
          1       0.16      0.44      0.23       119

avg / total       0.90      0.80      0.84      1747

Confusion matrixdropM:
[[1349  279]
 [  67   52]]
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
[0 1 1 ... 1 1 0]
==============================================================================================================
DecisionTreeClassifier
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=15,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=1e-07, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 19.61%
Model accuracydropM: 88.27% 
ROC just use this to check overfitting:
 
0.5709511077158136
0.8928757150286012
----------------Useful Scores: loss and cost-benefit scores

Log LossdropM: 0.99
Brier score loss: 0.12
BenefitItemdropM: 25
BenefitCodropM: 21.01%
CostItemdropM: 136
CostCodropM: 7.78%
ImproveRatiodropM: 269.87%
balancetradeoffradio: 5.44
ProfitratiodropM: 0.18

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropM: 1554
MaxBudgetPerTargetAudienceBdropM: 128.70

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropM: 1709
MaxBudgetPerTargetAudiencePdropM: 117.03

             precision    recall  f1-score   support

          0       0.94      0.93      0.94      1628
          1       0.18      0.21      0.20       119

avg / total       0.89      0.88      0.89      1747

Confusion matrixdropM:
[[1517  111]
 [  94   25]]
[0 1 0 ... 0 0 0]
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    2.6s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
==============================================================================================================
RandomForestClassifier
RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=15, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=1e-07,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=42,
            verbose=1, warm_start=False)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 28.25%
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
Model accuracydropM: 85.46% 
ROC just use this to check overfitting:
 
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.5s finished
0.6532658517952636
0.861414456578263
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.3s finished
----------------Useful Scores: loss and cost-benefit scores

Log LossdropM: 0.38
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
Brier score loss: 0.15
BenefitItemdropM: 50
BenefitCodropM: 42.02%
CostItemdropM: 235
CostCodropM: 13.45%
ImproveRatiodropM: 312.35%
balancetradeoffradio: 4.70
ProfitratiodropM: 0.21

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropM: 1342
MaxBudgetPerTargetAudienceBdropM: 149.03

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropM: 1477
MaxBudgetPerTargetAudiencePdropM: 135.41

             precision    recall  f1-score   support

          0       0.95      0.89      0.92      1628
          1       0.21      0.42      0.28       119

avg / total       0.90      0.85      0.88      1747

Confusion matrixdropM:
[[1443  185]
 [  69   50]]
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.3s finished
[0 1 1 ... 0 0 0]
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    3.3s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
==============================================================================================================
ExtraTreesClassifier
ExtraTreesClassifier(bootstrap=True, class_weight='balanced_subsample',
           criterion='gini', max_depth=15, max_features='auto',
           max_leaf_nodes=None, min_impurity_decrease=1e-07,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=1, oob_score=False, random_state=42,
           verbose=1, warm_start=False)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 28.57%
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.3s finished
Model accuracydropM: 83.11% 
ROC just use this to check overfitting:
 
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.5s finished
0.6757247124894183
0.8082423296931878
----------------Useful Scores: loss and cost-benefit scores

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
Log LossdropM: 0.46
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
Brier score loss: 0.17
BenefitItemdropM: 59
BenefitCodropM: 49.58%
CostItemdropM: 294
CostCodropM: 16.83%
ImproveRatiodropM: 294.61%
balancetradeoffradio: 4.98
ProfitratiodropM: 0.20

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropM: 1422
MaxBudgetPerTargetAudienceBdropM: 140.65

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropM: 1565
MaxBudgetPerTargetAudiencePdropM: 127.80

             precision    recall  f1-score   support

          0       0.96      0.86      0.90      1628
          1       0.20      0.50      0.29       119

avg / total       0.91      0.83      0.86      1747

Confusion matrixdropM:
[[1393  235]
 [  60   59]]
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.3s finished
[0 1 1 ... 0 0 0]
==============================================================================================================
LinearDiscriminantAnalysis
LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 23.72%
Model accuracydropM: 70.18% 
ROC just use this to check overfitting:
 
0.6919946111122581
0.7011180447217888
----------------Useful Scores: loss and cost-benefit scores

Log LossdropM: 0.57
Brier score loss: 0.30
BenefitItemdropM: 81
BenefitCodropM: 68.07%
CostItemdropM: 564
CostCodropM: 32.28%
ImproveRatiodropM: 210.84%
balancetradeoffradio: 6.96
ProfitratiodropM: 0.14

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropM: 1988
MaxBudgetPerTargetAudienceBdropM: 100.60

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropM: 2187
MaxBudgetPerTargetAudiencePdropM: 91.45

             precision    recall  f1-score   support

          0       0.97      0.70      0.81      1628
          1       0.14      0.68      0.24       119

avg / total       0.91      0.70      0.78      1747

Confusion matrixdropM:
[[1145  483]
 [  38   81]]
[0 1 1 ... 0 0 1]
==============================================================================================================
BernoulliNB
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 25.56%
Model accuracydropM: 73.33% 
ROC just use this to check overfitting:
 
0.7049919476390065
0.7028081123244929
----------------Useful Scores: loss and cost-benefit scores

Log LossdropM: 0.70
Brier score loss: 0.27
BenefitItemdropM: 80
BenefitCodropM: 67.23%
CostItemdropM: 507
CostCodropM: 29.02%
ImproveRatiodropM: 231.65%
balancetradeoffradio: 6.34
ProfitratiodropM: 0.16

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropM: 1811
MaxBudgetPerTargetAudienceBdropM: 110.44

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropM: 1992
MaxBudgetPerTargetAudiencePdropM: 100.40

             precision    recall  f1-score   support

          0       0.97      0.74      0.84      1628
          1       0.16      0.67      0.26       119

avg / total       0.91      0.73      0.80      1747

Confusion matrixdropM:
[[1201  427]
 [  39   80]]
[0 1 1 ... 1 0 0]
[LibLinear]==============================================================================================================
LogisticRegression
LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=1000,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=2, warm_start=False)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 23.37%
Model accuracydropM: 70.35% 
ROC just use this to check overfitting:
 
0.6851268763033468
0.6998179927197088
----------------Useful Scores: loss and cost-benefit scores

Log LossdropM: 0.56
Brier score loss: 0.30
BenefitItemdropM: 79
BenefitCodropM: 66.39%
CostItemdropM: 557
CostCodropM: 31.88%
ImproveRatiodropM: 208.22%
balancetradeoffradio: 7.05
ProfitratiodropM: 0.14

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropM: 2014
MaxBudgetPerTargetAudienceBdropM: 99.30

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropM: 2215
MaxBudgetPerTargetAudiencePdropM: 90.29

             precision    recall  f1-score   support

          0       0.97      0.71      0.82      1628
          1       0.14      0.66      0.23       119

avg / total       0.91      0.70      0.78      1747

Confusion matrixdropM:
[[1150  478]
 [  40   79]]
[0 1 1 ... 0 0 1]
[LibSVM]==============================================================================================================
SVC
SVC(C=10, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=42, shrinking=True,
  tol=0.001, verbose=True)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 24.50%
Model accuracydropM: 76.36% 
ROC just use this to check overfitting:
 
0.6706403691697809
0.7705408216328654
----------------Useful Scores: loss and cost-benefit scores

Log LossdropM: 0.48
Brier score loss: 0.24
BenefitItemdropM: 67
BenefitCodropM: 56.30%
CostItemdropM: 428
CostCodropM: 24.50%
ImproveRatiodropM: 229.81%
balancetradeoffradio: 6.39
ProfitratiodropM: 0.16

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropM: 1825
MaxBudgetPerTargetAudienceBdropM: 109.59

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropM: 2008
MaxBudgetPerTargetAudiencePdropM: 99.60

             precision    recall  f1-score   support

          0       0.96      0.78      0.86      1628
          1       0.16      0.56      0.24       119

avg / total       0.91      0.76      0.82      1747

Confusion matrixdropM:
[[1267  361]
 [  52   67]]
[1 0 1 ... 0 0 0]

4.3.1.2 Train, test and evaluate the models with the datasets: Smote X_TrainselecdropM; y_TrainselecdropM

In [85]:
def draw_confusion_matricesselecdropM(confusion_matriciesselecdropM,class_namesselecdropM):
    class_namesselecdropM = class_namesselecdropM.tolist()
    for cm in confusion_matricesselecdropM:
        classifier, cm = cm[0], cm[1]
        print'Confusion matrixselecdropM:\n', cm              
        fig = plt.figure()
        ax = fig.add_subplot(111)
        sns.heatmap(cm, annot=True, ax = ax,cmap='Blues', fmt='g'); #annot=True to annotate cells 
        plt.ylabel('True')
        plt.xlabel('Predicted')    
        ax.xaxis.set_ticklabels(['Not Caravan', 'Caraven'],horizontalalignment="center"); ax.yaxis.set_ticklabels(['Not Caravan', 'Caraven'],rotation=45);
        plt.show()   
    
        
class_namesselecdropM = np.unique(np.array(y_TestselecdropM))

# Logging for Visual Comparison
log_colsselecUdropM = ["Classifier", "F-score","Accuracy"]
logselecUdropM = pd.DataFrame(columns=log_colsselecUdropM)

log_colsselecdropM=["Classifier", "Log Loss","OverfittingRoc", "BLoss","BI","BO", "CI", "CO", "IR","BTOR","PR"] 
logselecdropM = pd.DataFrame(columns=log_colsselecdropM)

log_colsCBAselecdropM =["Classifier", "MTAB","MBPTAB","MTAP","MBPTAP"]
logCBAselecdropM = pd.DataFrame(columns=log_colsCBAselecdropM)

for clf in classifiers:
    clf.fit(X_TrainselecdropM, y_TrainselecdropM)
    nameselecdropM = clf.__class__.__name__
    
    print("="*110)
    print(nameselecdropM)
    
    print('****ResultsselecdropM****')
    print('\n----------------Unhelpful Scores\n')

    test_predictionsselecdropM = clf.predict(X_TestselecdropM)
    accselecdropM = f1_score(y_TestselecdropM, test_predictionsselecdropM)
    print("F-scoreselecdropM: {:.2%}".format(accselecdropM))
    
    test_predictionsselecdropM = clf.predict(X_TestselecdropM)
    acc2selecdropM = accuracy_score(y_TestselecdropM, test_predictionsselecdropM)

    print('Model accuracyselecdropM: {:.2%} '.format(acc2selecdropM))
    
    
    print('\n----------------Useful Scores: loss and cost-benefit scores\n')
    
    print('ROC just used here this to check overfitting(since if focus on positive values(0 here, not caravan), but we cares more about negative values): \nIf both curves are not too far from each other indicates (Train above test) that there is little overfitting, if the roc of the train is much better that the test and both curves are far from each other, then it´s overfiting. Otherwise underfitting.')

    RocScoreselec=roc_auc_score(y_TestselecdropM, clf.predict(X_TestselecdropM))
    fprBselec, tprBselec, thresholdsBselec = roc_curve(y_TestselecdropM, clf.predict_proba(X_TestselecdropM)[:,1])
    RocScoreTrainselec=roc_auc_score(y_TrainselecdropM, clf.predict(X_TrainselecdropM))
    fprBTrainselec, tprBTrainselec, thresholdsBTrainselec = roc_curve(y_TrainselecdropM, clf.predict_proba(X_TrainselecdropM)[:,1])
    
    OverfittingRocselec=float(format(RocScoreselec-RocScoreTrainselec,'.2f'))

    
    print RocScoreselec
    print RocScoreTrainselec
    plt.figure()
    plt.plot(fprBselec, tprBselec,  label='classifiersTest' % RocScoreselec)
    plt.plot(fprBTrainselec, tprBTrainselec, label='classifiersTrain' % RocScoreTrainselec)
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('Log_ROC')
    plt.show()  
    
    test_predictionsdropM = clf.predict_proba(X_TestselecdropM)
    llselecdropM = log_loss(y_TestselecdropM, test_predictionsdropM)
    print("Log LossselecdropM: {:.2f}".format(llselecdropM))

    test_predictionsdropM = clf.predict(X_TestselecdropM)
    
    # This function returns a score of the mean square difference between the actual outcome and the predicted probability of the possible outcome. The actual outcome has to be 1 or 0 (true or false), while the predicted probability of the actual outcome can be a value between 0 and 1.The brier score loss is also between 0 to 1 and the lower the score (the mean square difference is smaller), the more accurate the prediction is. It can be thought of as a measure of the “calibration” of a set of probabilistic predictions.
    BLossselec= brier_score_loss(y_TestselecdropM, test_predictionsselecdropM)
    print("Brier score loss: {:.2f}".format(BLossselec))
    
    test_predictionsselecdropM = clf.predict(X_TestselecdropM)
    confusionselecdropM = metrics.confusion_matrix(y_TestselecdropM, test_predictionsselecdropM)
    TPselecdropM = confusionselecdropM[0, 0]
    TNselecdropM = confusionselecdropM[1, 1]
    FPselecdropM = confusionselecdropM[1, 0]
    FNselecdropM = confusionselecdropM[0, 1]
    BenefitItemselecdropM = TNselecdropM 
    BenefitCoselecdropM = TNselecdropM / (TNselecdropM + FPselecdropM) # this is specificity in statistics 
    print("BenefitItemselecdropM: {}".format(BenefitItemselecdropM))
    print("BenefitCoselecdropM: {:.2%}".format(BenefitCoselecdropM))
    
    CostItemselecdropM = (TNselecdropM + FNselecdropM)
    CostCoselecdropM = (TNselecdropM + FNselecdropM) / (TPselecdropM + TNselecdropM + FPselecdropM +FNselecdropM) 
          
    print("CostItemselecdropM: {}".format(CostItemselecdropM))
    print("CostCoselecdropM: {:.2%}".format(CostCoselecdropM))
    if CostCoselecdropM == 0: 
        ImproveRatioselecdropM = 0
    else: 
        ImproveRatioselecdropM = (BenefitItemselecdropM/CostItemselecdropM) /((TNselecdropM+FPselecdropM) /(TPselecdropM + TNselecdropM + FPselecdropM +FNselecdropM))   
    print(CBLUE+"ImproveRatioselecdropM: {:.2%}".format(ImproveRatioselecdropM)+CEND)
    
    #scenario BenefitItem*price-CostItem*
    if BenefitItemselecdropM == 0:
        balancetradeoffradioselecdropM = 0
    else:
        balancetradeoffradioselecdropM = float(format(CostItemselecdropM/BenefitItemselecdropM , '.2f'))
    if CostCoselecdropM == 0: 
        ProfitratioselecdropM=0
    else:
        ProfitratioselecdropM = float(format(BenefitItemselecdropM/CostItemselecdropM, '.2f'))
    print(CBLUE+"balancetradeoffradioselecdropM: {0:.2f}".format(balancetradeoffradioselecdropM)+CEND)
    print(CBLUE+"ProfitratioselecdropM: {0:.2f}".format(ProfitratioselecdropM)+CEND)

    
    print('\n----------------Scenario analysis shreshold: marketing and controlling strategies\n')
    #   #Scenario: give a budget and the revenue, then I can predict, how much you can spend on each costItem or how many costItem can be spent 
    Budget = 200000
    #One scenario considers benefitgoal, one scenario considers the balance, no deficit
    ProfitGoal= 20000
    # Proft maybe 700 every year, the profit of every Customer is the Reveneue from each customer minus the cost of the customer management 
    ProfitPerBenefitItem = 700

    print ("Business Application of the Model: \n\nLet´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. \nIn summary, the given KPIs of the insurance company are: " +CBEIGE+ "\n\nBudget = 200000 \nRevenuePerBenefitItem = 700 \nProfitGoal = 20000\n"+CEND+CBLUE+'\nThen to keep the account balance not to be deficit, then the insurance company should be:\n\n'+CEND)
# within this budget, the smallest amount of  target audience should be reached, in order to keep the balance of the account 
# Min CostItem
    if balancetradeoffradioselecdropM == 0:
        MinTargetAudienceBselecdropM = 0
    else:
        MinTargetAudienceBselecdropM= int((Budget/ProfitPerBenefitItem)*balancetradeoffradioselecdropM)
    
    #For every Target Audience, how much the company can spend for max. so that the company can keep the balance and avoid deficit
    if MinTargetAudienceBselecdropM == 0:
        MaxBudgetPerTargetAudienceBselecdropM = 0
    else:
        MaxBudgetPerTargetAudienceBselecdropM = float(format(Budget/MinTargetAudienceBselecdropM, '.2f'))
    
    #To reach the benifit goal, at least MinTargetAudienceP should be reached 
    if balancetradeoffradioselecdropM == 0:
        MinTargetAudiencePselecdropM = 0
    else:
        MinTargetAudiencePselecdropM = int(((Budget + ProfitGoal)/ProfitPerBenefitItem)*balancetradeoffradioselecdropM)
    
    #To reach the benifit goal, at most MaxBudgetPerTargetAudienceP can be spent 
    if MinTargetAudienceBselecdropM == 0:
        MaxBudgetPerTargetAudiencePselecdropM = 0
    else:
        MaxBudgetPerTargetAudiencePselecdropM = float(format(Budget/MinTargetAudiencePselecdropM, '.2f'))
    
    
    print(CRED+"MinTargetAudienceBselecdropM: {}".format(MinTargetAudienceBselecdropM)+CEND)
    print(CRED+"MaxBudgetPerTargetAudienceBselecdropM: {0:.2f}".format(MaxBudgetPerTargetAudienceBselecdropM)+CEND) 
    print('\nIn order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:')
    print(CRED+"MinTargetAudiencePselecdropM: {}".format(MinTargetAudiencePselecdropM)+CEND)
    print(CRED+"MaxBudgetPerTargetAudiencePselecdropM: {0:.2f}".format(MaxBudgetPerTargetAudiencePselecdropM)+CEND+"\n")

    log_entryselecUdropM = pd.DataFrame([[nameselecdropM, accselecdropM,acc2selecdropM]], columns=log_colsselecUdropM)#FPR*100,
    logselecUdropM = logselecUdropM.append(log_entryselecUdropM,ignore_index=True)

    log_entryselecdropM = pd.DataFrame([[nameselecdropM,llselecdropM,OverfittingRocselec, BLossselec,BenefitItemselecdropM,BenefitCoselecdropM, CostItemselecdropM, CostCoselecdropM,ImproveRatioselecdropM,balancetradeoffradioselecdropM,ProfitratioselecdropM]], columns=log_colsselecdropM)#FPR*100,
    logselecdropM = logselecdropM.append(log_entryselecdropM,ignore_index=True)
        
    
    log_entryCBAselecdropM = pd.DataFrame([[nameselecdropM,MinTargetAudienceBselecdropM,MaxBudgetPerTargetAudienceBselecdropM,MinTargetAudiencePselecdropM,MaxBudgetPerTargetAudiencePselecdropM]], columns=log_colsCBAselecdropM)
    logCBAselecdropM = logCBAselecdropM.append(log_entryCBAselecdropM,ignore_index=True)
    
    
    reportselecdropM = classification_report(y_TestselecdropM, test_predictionsselecdropM)
    print(reportselecdropM)
    confusion_matricesselecdropM = [
    ( "", confusion_matrix(y_TestselecdropM, test_predictionsselecdropM))
    ]
    draw_confusion_matricesselecdropM(confusion_matricesselecdropM,class_namesselecdropM)
    
    print("="*60)
    print(str(clf));print('\n')
    
    predictions = clf.predict(OutputselecdropM)
    print predictions
    #new_column = df['Classifiers'] 
#    pd.DataFrame(predictions).to_csv('C:\Users\chenp\Desktop\output.csv', index = False,header = False, sep=',', mode='a',encoding ='utf-8')
    with open('C:\Users\chenp\Desktop\output.selec.4.3.1.2.csv', 'a') as csvfile:#, newline=''
                fwriter = csv.writer(csvfile, delimiter=',',quotechar='/',quoting=csv.QUOTE_MINIMAL)#, 
                fwriter.writerow(predictions)
==============================================================================================================
KNeighborsClassifier
****ResultsselecdropM****

----------------Unhelpful Scores

F-scoreselecdropM: 3.12%
Model accuracyselecdropM: 92.90% 

----------------Useful Scores: loss and cost-benefit scores

ROC just used here this to check overfitting(since if focus on positive values(0 here, not caravan), but we cares more about negative values): 
If both curves are not too far from each other indicates (Train above test) that there is little overfitting, if the roc of the train is much better that the test and both curves are far from each other, then it´s overfiting. Otherwise underfitting.
0.5062534841946607
0.5313210345007687
Log LossselecdropM: 1.65
Brier score loss: 0.07
BenefitItemselecdropM: 2
BenefitCoselecdropM: 1.68%
CostItemselecdropM: 9
CostCoselecdropM: 0.52%
ImproveRatioselecdropM: 326.24%
balancetradeoffradioselecdropM: 4.50
ProfitratioselecdropM: 0.22

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBselecdropM: 1285
MaxBudgetPerTargetAudienceBselecdropM: 155.64

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePselecdropM: 1414
MaxBudgetPerTargetAudiencePselecdropM: 141.44

             precision    recall  f1-score   support

          0       0.93      1.00      0.96      1628
          1       0.22      0.02      0.03       119

avg / total       0.88      0.93      0.90      1747

Confusion matrixselecdropM:
[[1621    7]
 [ 117    2]]
============================================================
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')


[0 0 0 ... 0 0 0]
==============================================================================================================
AdaBoostClassifier
****ResultsselecdropM****

----------------Unhelpful Scores

F-scoreselecdropM: 0.00%
Model accuracyselecdropM: 93.19% 

----------------Useful Scores: loss and cost-benefit scores

ROC just used here this to check overfitting(since if focus on positive values(0 here, not caravan), but we cares more about negative values): 
If both curves are not too far from each other indicates (Train above test) that there is little overfitting, if the roc of the train is much better that the test and both curves are far from each other, then it´s overfiting. Otherwise underfitting.
0.5
0.5057701871393633
Log LossselecdropM: 0.66
Brier score loss: 0.07
BenefitItemselecdropM: 0
BenefitCoselecdropM: 0.00%
CostItemselecdropM: 0
CostCoselecdropM: 0.00%
ImproveRatioselecdropM: 0.00%
balancetradeoffradioselecdropM: 0.00
ProfitratioselecdropM: 0.00

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBselecdropM: 0
MaxBudgetPerTargetAudienceBselecdropM: 0.00

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePselecdropM: 0
MaxBudgetPerTargetAudiencePselecdropM: 0.00

             precision    recall  f1-score   support

          0       0.93      1.00      0.96      1628
          1       0.00      0.00      0.00       119

avg / total       0.87      0.93      0.90      1747

Confusion matrixselecdropM:
[[1628    0]
 [ 119    0]]
============================================================
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)


[0 0 0 ... 0 0 0]
==============================================================================================================
GradientBoostingClassifier
****ResultsselecdropM****

----------------Unhelpful Scores

F-scoreselecdropM: 14.44%
Model accuracyselecdropM: 91.18% 

----------------Useful Scores: loss and cost-benefit scores

ROC just used here this to check overfitting(since if focus on positive values(0 here, not caravan), but we cares more about negative values): 
If both curves are not too far from each other indicates (Train above test) that there is little overfitting, if the roc of the train is much better that the test and both curves are far from each other, then it´s overfiting. Otherwise underfitting.
0.5398798339974811
0.612987008563312
Log LossselecdropM: 1.07
Brier score loss: 0.09
BenefitItemselecdropM: 13
BenefitCoselecdropM: 10.92%
CostItemselecdropM: 61
CostCoselecdropM: 3.49%
ImproveRatioselecdropM: 312.87%
balancetradeoffradioselecdropM: 4.69
ProfitratioselecdropM: 0.21

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBselecdropM: 1340
MaxBudgetPerTargetAudienceBselecdropM: 149.25

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePselecdropM: 1474
MaxBudgetPerTargetAudiencePselecdropM: 135.69

             precision    recall  f1-score   support

          0       0.94      0.97      0.95      1628
          1       0.21      0.11      0.14       119

avg / total       0.89      0.91      0.90      1747

Confusion matrixselecdropM:
[[1580   48]
 [ 106   13]]
============================================================
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=1.0, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)


[0 1 0 ... 0 0 0]
==============================================================================================================
LGBMClassifier
****ResultsselecdropM****

----------------Unhelpful Scores

F-scoreselecdropM: 0.00%
Model accuracyselecdropM: 93.19% 

----------------Useful Scores: loss and cost-benefit scores

ROC just used here this to check overfitting(since if focus on positive values(0 here, not caravan), but we cares more about negative values): 
If both curves are not too far from each other indicates (Train above test) that there is little overfitting, if the roc of the train is much better that the test and both curves are far from each other, then it´s overfiting. Otherwise underfitting.
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
0.5
0.5
Log LossselecdropM: 0.22
Brier score loss: 0.07
BenefitItemselecdropM: 0
BenefitCoselecdropM: 0.00%
CostItemselecdropM: 0
CostCoselecdropM: 0.00%
ImproveRatioselecdropM: 0.00%
balancetradeoffradioselecdropM: 0.00
ProfitratioselecdropM: 0.00

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBselecdropM: 0
MaxBudgetPerTargetAudienceBselecdropM: 0.00

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePselecdropM: 0
MaxBudgetPerTargetAudiencePselecdropM: 0.00

             precision    recall  f1-score   support

          0       0.93      1.00      0.96      1628
          1       0.00      0.00      0.00       119

avg / total       0.87      0.93      0.90      1747

Confusion matrixselecdropM:
[[1628    0]
 [ 119    0]]
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
============================================================
LGBMClassifier(boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.631179404427, importance_type='split',
        learning_rate=0.0278025184912, max_depth=-1, metric='auc',
        min_child_samples=250, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=138, n_jobs=-1, num_leaves=40, objective='binary',
        random_state=50, reg_alpha=0.0618311835591,
        reg_lambda=0.247428314075, silent=True, subsample=0.999742610272,
        subsample_for_bin=280000, subsample_freq=1, verbose=1)


[0 0 0 ... 0 0 0]
==============================================================================================================
DecisionTreeClassifier
****ResultsselecdropM****

----------------Unhelpful Scores

F-scoreselecdropM: 4.58%
Model accuracyselecdropM: 92.84% 

----------------Useful Scores: loss and cost-benefit scores

ROC just used here this to check overfitting(since if focus on positive values(0 here, not caravan), but we cares more about negative values): 
If both curves are not too far from each other indicates (Train above test) that there is little overfitting, if the roc of the train is much better that the test and both curves are far from each other, then it´s overfiting. Otherwise underfitting.
0.5098409142526789
0.5610053659788313
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
Log LossselecdropM: 0.78
Brier score loss: 0.07
BenefitItemselecdropM: 3
BenefitCoselecdropM: 2.52%
CostItemselecdropM: 12
CostCoselecdropM: 0.69%
ImproveRatioselecdropM: 367.02%
balancetradeoffradioselecdropM: 4.00
ProfitratioselecdropM: 0.25

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBselecdropM: 1142
MaxBudgetPerTargetAudienceBselecdropM: 175.13

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePselecdropM: 1257
MaxBudgetPerTargetAudiencePselecdropM: 159.11

             precision    recall  f1-score   support

          0       0.93      0.99      0.96      1628
          1       0.25      0.03      0.05       119

avg / total       0.89      0.93      0.90      1747

Confusion matrixselecdropM:
[[1619    9]
 [ 116    3]]
============================================================
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=15,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=1e-07, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


[0 0 0 ... 0 0 0]
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    1.4s finished
==============================================================================================================
RandomForestClassifier
****ResultsselecdropM****

----------------Unhelpful Scores

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
F-scoreselecdropM: 28.43%
Model accuracyselecdropM: 83.57% 

----------------Useful Scores: loss and cost-benefit scores

ROC just used here this to check overfitting(since if focus on positive values(0 here, not caravan), but we cares more about negative values): 
If both curves are not too far from each other indicates (Train above test) that there is little overfitting, if the roc of the train is much better that the test and both curves are far from each other, then it´s overfiting. Otherwise underfitting.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.4s finished
0.6703926042161337
0.7769320816500782
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished
Log LossselecdropM: 0.45
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
Brier score loss: 0.16
BenefitItemselecdropM: 57
BenefitCoselecdropM: 47.90%
CostItemselecdropM: 282
CostCoselecdropM: 16.14%
ImproveRatioselecdropM: 296.74%
balancetradeoffradioselecdropM: 4.95
ProfitratioselecdropM: 0.20

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBselecdropM: 1414
MaxBudgetPerTargetAudienceBselecdropM: 141.44

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePselecdropM: 1555
MaxBudgetPerTargetAudiencePselecdropM: 128.62

             precision    recall  f1-score   support

          0       0.96      0.86      0.91      1628
          1       0.20      0.48      0.28       119

avg / total       0.91      0.84      0.86      1747

Confusion matrixselecdropM:
[[1403  225]
 [  62   57]]
============================================================
RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=15, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=1e-07,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=42,
            verbose=1, warm_start=False)


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.4s finished
[0 1 1 ... 0 0 0]
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    1.5s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
==============================================================================================================
ExtraTreesClassifier
****ResultsselecdropM****

----------------Unhelpful Scores

F-scoreselecdropM: 28.79%
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
Model accuracyselecdropM: 84.14% 

----------------Useful Scores: loss and cost-benefit scores

ROC just used here this to check overfitting(since if focus on positive values(0 here, not caravan), but we cares more about negative values): 
If both curves are not too far from each other indicates (Train above test) that there is little overfitting, if the roc of the train is much better that the test and both curves are far from each other, then it´s overfiting. Otherwise underfitting.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.3s finished
0.6695693019222432
0.7755287067377892
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
Log LossselecdropM: 0.42
Brier score loss: 0.16
BenefitItemselecdropM: 56
BenefitCoselecdropM: 47.06%
CostItemselecdropM: 270
CostCoselecdropM: 15.46%
ImproveRatioselecdropM: 304.49%
balancetradeoffradioselecdropM: 4.82
ProfitratioselecdropM: 0.21

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBselecdropM: 1377
MaxBudgetPerTargetAudienceBselecdropM: 145.24

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePselecdropM: 1514
MaxBudgetPerTargetAudiencePselecdropM: 132.10

             precision    recall  f1-score   support

          0       0.96      0.87      0.91      1628
          1       0.21      0.47      0.29       119

avg / total       0.91      0.84      0.87      1747

Confusion matrixselecdropM:
[[1414  214]
 [  63   56]]
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
============================================================
ExtraTreesClassifier(bootstrap=True, class_weight='balanced_subsample',
           criterion='gini', max_depth=15, max_features='auto',
           max_leaf_nodes=None, min_impurity_decrease=1e-07,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=1, oob_score=False, random_state=42,
           verbose=1, warm_start=False)


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished
[0 1 1 ... 0 0 0]
==============================================================================================================
LinearDiscriminantAnalysis
****ResultsselecdropM****

----------------Unhelpful Scores

F-scoreselecdropM: 4.69%
Model accuracyselecdropM: 93.02% 

----------------Useful Scores: loss and cost-benefit scores

ROC just used here this to check overfitting(since if focus on positive values(0 here, not caravan), but we cares more about negative values): 
If both curves are not too far from each other indicates (Train above test) that there is little overfitting, if the roc of the train is much better that the test and both curves are far from each other, then it´s overfiting. Otherwise underfitting.
0.5107622901740548
0.519857300842252
Log LossselecdropM: 0.23
Brier score loss: 0.07
BenefitItemselecdropM: 3
BenefitCoselecdropM: 2.52%
CostItemselecdropM: 9
CostCoselecdropM: 0.52%
ImproveRatioselecdropM: 489.36%
balancetradeoffradioselecdropM: 3.00
ProfitratioselecdropM: 0.33

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBselecdropM: 857
MaxBudgetPerTargetAudienceBselecdropM: 233.37

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePselecdropM: 942
MaxBudgetPerTargetAudiencePselecdropM: 212.31

             precision    recall  f1-score   support

          0       0.93      1.00      0.96      1628
          1       0.33      0.03      0.05       119

avg / total       0.89      0.93      0.90      1747

Confusion matrixselecdropM:
[[1622    6]
 [ 116    3]]
============================================================
LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)


[0 0 0 ... 0 0 0]
==============================================================================================================
BernoulliNB
****ResultsselecdropM****

----------------Unhelpful Scores

F-scoreselecdropM: 0.00%
Model accuracyselecdropM: 93.19% 

----------------Useful Scores: loss and cost-benefit scores

ROC just used here this to check overfitting(since if focus on positive values(0 here, not caravan), but we cares more about negative values): 
If both curves are not too far from each other indicates (Train above test) that there is little overfitting, if the roc of the train is much better that the test and both curves are far from each other, then it´s overfiting. Otherwise underfitting.
0.5
0.5031967654252022
Log LossselecdropM: 0.23
Brier score loss: 0.07
BenefitItemselecdropM: 0
BenefitCoselecdropM: 0.00%
CostItemselecdropM: 0
CostCoselecdropM: 0.00%
ImproveRatioselecdropM: 0.00%
balancetradeoffradioselecdropM: 0.00
ProfitratioselecdropM: 0.00

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBselecdropM: 0
MaxBudgetPerTargetAudienceBselecdropM: 0.00

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePselecdropM: 0
MaxBudgetPerTargetAudiencePselecdropM: 0.00

             precision    recall  f1-score   support

          0       0.93      1.00      0.96      1628
          1       0.00      0.00      0.00       119

avg / total       0.87      0.93      0.90      1747

Confusion matrixselecdropM:
[[1628    0]
 [ 119    0]]
============================================================
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)


[0 0 0 ... 0 0 0]
[LibLinear]==============================================================================================================
LogisticRegression
****ResultsselecdropM****

----------------Unhelpful Scores

F-scoreselecdropM: 24.09%
Model accuracyselecdropM: 70.06% 

----------------Useful Scores: loss and cost-benefit scores

ROC just used here this to check overfitting(since if focus on positive values(0 here, not caravan), but we cares more about negative values): 
If both curves are not too far from each other indicates (Train above test) that there is little overfitting, if the roc of the train is much better that the test and both curves are far from each other, then it´s overfiting. Otherwise underfitting.
0.6991694712282948
0.7011589197192342
Log LossselecdropM: 0.57
Brier score loss: 0.30
BenefitItemselecdropM: 83
BenefitCoselecdropM: 69.75%
CostItemselecdropM: 570
CostCoselecdropM: 32.63%
ImproveRatioselecdropM: 213.77%
balancetradeoffradioselecdropM: 6.87
ProfitratioselecdropM: 0.15

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBselecdropM: 1962
MaxBudgetPerTargetAudienceBselecdropM: 101.94

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePselecdropM: 2159
MaxBudgetPerTargetAudiencePselecdropM: 92.64

             precision    recall  f1-score   support

          0       0.97      0.70      0.81      1628
          1       0.15      0.70      0.24       119

avg / total       0.91      0.70      0.77      1747

Confusion matrixselecdropM:
[[1141  487]
 [  36   83]]
============================================================
LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=1000,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=2, warm_start=False)


[0 1 1 ... 0 0 1]
[LibSVM]==============================================================================================================
SVC
****ResultsselecdropM****

----------------Unhelpful Scores

F-scoreselecdropM: 26.90%
Model accuracyselecdropM: 78.53% 

----------------Useful Scores: loss and cost-benefit scores

ROC just used here this to check overfitting(since if focus on positive values(0 here, not caravan), but we cares more about negative values): 
If both curves are not too far from each other indicates (Train above test) that there is little overfitting, if the roc of the train is much better that the test and both curves are far from each other, then it´s overfiting. Otherwise underfitting.
0.6901002415708298
0.7565411350078457
Log LossselecdropM: 0.23
Brier score loss: 0.21
BenefitItemselecdropM: 69
BenefitCoselecdropM: 57.98%
CostItemselecdropM: 394
CostCoselecdropM: 22.55%
ImproveRatioselecdropM: 257.10%
balancetradeoffradioselecdropM: 5.71
ProfitratioselecdropM: 0.18

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBselecdropM: 1631
MaxBudgetPerTargetAudienceBselecdropM: 122.62

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePselecdropM: 1794
MaxBudgetPerTargetAudiencePselecdropM: 111.48

             precision    recall  f1-score   support

          0       0.96      0.80      0.87      1628
          1       0.18      0.58      0.27       119

avg / total       0.91      0.79      0.83      1747

Confusion matrixselecdropM:
[[1303  325]
 [  50   69]]
============================================================
SVC(C=10, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=42, shrinking=True,
  tol=0.001, verbose=True)


[1 1 1 ... 0 0 0]

4.3.1.3 Train, test and evaluate the models with the datasets: Smote X_TraindropM2, y_TraindropM2

In [110]:
from sklearn import metrics
from sklearn.metrics import roc_curve, roc_auc_score
from __future__ import division

def draw_confusion_matricesdropM2(confusion_matricesdropM2,class_namesdropM2):
    class_namesdropM2 = class_namesdropM2.tolist()
    for cm in confusion_matricesdropM2:
        classifier, cm = cm[0], cm[1]
        print'Confusion matrixdropM:\n', cm              
        fig = plt.figure()
        ax = fig.add_subplot(111)
        sns.heatmap(cm, annot=True, ax = ax,cmap='Blues', fmt='g'); #annot=True to annotate cells 
        plt.ylabel('True')
        plt.xlabel('Predicted')    
        ax.xaxis.set_ticklabels(['Not Caravan', 'Caraven'],horizontalalignment="center"); ax.yaxis.set_ticklabels(['Not Caravan', 'Caraven'],rotation=45);
        plt.show()   
    
        
class_namesdropM2 = np.unique(np.array(y_TestdropM2))

# Logging for Visual Comparison
log_colsUdropM2 = ["Classifier", "F-score","Accuracy"]
logUdropM2 = pd.DataFrame(columns=log_colsUdropM2)

log_colsdropM2=["Classifier","Log Loss","OverfittingRoc", "BLoss2","BI","BO", "CI", "CO", "IR","BTOR","PR"] 
logdropM2 = pd.DataFrame(columns=log_colsdropM2)

log_colsCBAdropM2 =["Classifier", "MTAB","MBPTAB","MTAP","MBPTAP"]
logCBAdropM2 = pd.DataFrame(columns=log_colsCBAdropM2)


CBLACK  = '\33[30m'
CRED    = '\33[31m'
CGREEN  = '\33[32m'
CYELLOW = '\33[33m'
CBLUE   = '\33[34m'
CVIOLET = '\33[35m'
CBEIGE  = '\33[36m'
CWHITE  = '\33[37m'

CBLACKBG  = '\33[40m'
CREDBG    = '\33[41m'
CGREENBG  = '\33[42m'
CYELLOWBG = '\33[43m'
CBLUEBG   = '\33[44m'
CVIOLETBG = '\33[45m'
CBEIGEBG  = '\33[46m'
CWHITEBG  = '\33[47m'

CGREY    = '\33[90m'
CEND = '\033[0m'

for clf in classifiers:
    clf.fit(X_TraindropM2, y_TraindropM2)
    namedropM2 = clf.__class__.__name__
    
    print("="*110)
    print(namedropM2)
    print(str(clf));print('\n')
    
    print('****************ResultsdropM****************')
    print('\n----------------Unhelpful Scores\n')

    test_predictionsdropM2 = clf.predict(X_TestdropM2)
    accdropM2 = f1_score(y_TestdropM2, test_predictionsdropM2)
    print("F-scoredropM: {:.2%}".format(accdropM2))
    
    test_predictionsdropM2 = clf.predict(X_TestdropM2)
    acc2dropM2 = accuracy_score(y_TestdropM, test_predictionsdropM2)

    print('Model accuracydropM: {:.2%} '.format(acc2dropM2))
     
    
    print('ROC just use this to check overfitting: \n')
#If both curves are not too far from each other indicates (Train above test) that there is little overfitting, if the roc of the train is much better that the test and both curves are far from each other, then it´s overfiting. Otherwise underfitting.
    RocScore2=roc_auc_score(y_TestdropM2, test_predictionsdropM2)
    fprB2, tprB2, thresholdsB2 = roc_curve(y_TestdropM2, clf.predict_proba(X_TestdropM2)[:,1])
    RocScoreTrain2=roc_auc_score(y_TraindropM2, clf.predict(X_TraindropM2))
    fprBTrain2, tprBTrain2, thresholdsBTrain2 = roc_curve(y_TraindropM2, clf.predict_proba(X_TraindropM2)[:,1])
    
    OverfittingRoc2=float(format(RocScore2-RocScoreTrain2,'.2f'))

    
    print RocScore2
    print RocScoreTrain2
    plt.figure()
    plt.plot(fprB2, tprB2, label='classifiersTest' % RocScore2)
    plt.plot(fprBTrain2, tprBTrain2, label='classifiersTrain' % RocScoreTrain2)
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('Log_ROC')
    plt.show()
    
    print('\n----------------Useful Scores: loss and cost-benefit scores\n')
    test_predictionsdropM2 = clf.predict_proba(X_TestdropM2)
    lldropM2 = log_loss(y_TestdropM, test_predictionsdropM2)
    print("Log LossdropM: {:.2f}".format(lldropM2))
    
    test_predictionsdropM2 = clf.predict(X_TestdropM2)
    
    # This function returns a score of the mean square difference between the actual outcome and the predicted probability of the possible outcome. The actual outcome has to be 1 or 0 (true or false), while the predicted probability of the actual outcome can be a value between 0 and 1.The brier score loss is also between 0 to 1 and the lower the score (the mean square difference is smaller), the more accurate the prediction is. It can be thought of as a measure of the “calibration” of a set of probabilistic predictions.
    BLoss2= brier_score_loss(y_TestdropM2, test_predictionsdropM2)
    print("Brier score loss: {:.2f}".format(BLoss2))
    
    test_predictionsdropM2 = clf.predict(X_TestdropM2)
    confusiondropM2 = metrics.confusion_matrix(y_TestdropM2, test_predictionsdropM2)
    TPdropM2 = confusiondropM2[0, 0]
    TNdropM2 = confusiondropM2[1, 1]
    FPdropM2 = confusiondropM2[1, 0]
    FNdropM2 = confusiondropM2[0, 1]
    BenefitItemdropM2 = TNdropM2 
    BenefitCodropM2 = TNdropM2 / (TNdropM2 + FPdropM2) # this is specificity in statistics 
    print("BenefitItemdropM: {}".format(BenefitItemdropM2))
    print("BenefitCodropM: {:.2%}".format(BenefitCodropM2))
    
    CostItemdropM2 = (TNdropM2 + FNdropM2)
    CostCodropM2 = (TNdropM2 + FNdropM2) / (TPdropM2 + TNdropM2 + FPdropM2 +FNdropM2) 
          
    print("CostItemdropM: {}".format(CostItemdropM2))
    print("CostCodropM: {:.2%}".format(CostCodropM2))
        
    ImproveRatiodropM2 = (BenefitItemdropM2/CostItemdropM2) /((TNdropM2+FPdropM2) /(TPdropM2 + TNdropM2 + FPdropM2+FNdropM2))   
    print("ImproveRatio: {:.2%}".format(ImproveRatiodropM2))
    
    #scenario BenefitItem*price-CostItem*
    balancetradeoffradiodropM2 = float(format(CostItemdropM2/BenefitItemdropM2 , '.2f'))
    print(CBLUE+"balancetradeoffradio: {0:.2f}".format(balancetradeoffradiodropM2)+CEND)
    ProfitratiodropM2 = float(format(BenefitItemdropM/CostItemdropM2, '.2f'))
    print(CBLUE+"balancetradeoffradio: {0:.2f}".format(balancetradeoffradiodropM2)+CEND)
    print(CBLUE+"ProfitratiodropM: {0:.2f}".format(ProfitratiodropM2)+CEND)
    
    print('\n----------------Scenario analysis shreshold: marketing and controlling strategies\n')
    #   #Scenario: give a budget and the revenue, then I can predict, how much you can spend on each costItem or how many costItem can be spent 
    Budget = 200000
    #One scenario considers benefitgoal, one scenario considers the balance, no deficit
    ProfitGoal= 20000
    # Proft maybe 700 every year, the profit of every Customer is the Reveneue from each customer minus the cost of the customer management 
    ProfitPerBenefitItem = 700

    print ("Business Application of the Model: \n\nLet´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. \nIn summary, the given KPIs of the insurance company are: " +CBEIGE+ "\n\nBudget = 200000 \nRevenuePerBenefitItem = 700 \nProfitGoal = 20000\n"+CEND+'\nThen to keep the account balance not to be deficit, then the insurance company should be:\n\n')
# within this budget, the smallest amount of  target audience should be reached, in order to keep the balance of the account 
# Min CostItem
    MinTargetAudienceBdropM2= int((Budget/ProfitPerBenefitItem)*balancetradeoffradiodropM2)
    
    #For every Target Audience, how much the company can spend for max. so that the company can keep the balance and avoid deficit
    MaxBudgetPerTargetAudienceBdropM2 = float(format(Budget/MinTargetAudienceBdropM2, '.2f'))
    
    #To reach the benifit goal, at least MinTargetAudienceP should be reached 
    MinTargetAudiencePdropM2 = int(((Budget + ProfitGoal)/ProfitPerBenefitItem)*balancetradeoffradiodropM2)
    
    #To reach the benifit goal, at most MaxBudgetPerTargetAudienceP can be spent 
    MaxBudgetPerTargetAudiencePdropM2 = float(format(Budget/MinTargetAudiencePdropM2, '.2f'))
    
    
    print("MinTargetAudienceBdropM: {}".format(MinTargetAudienceBdropM2))
    print("MaxBudgetPerTargetAudienceBdropM: {0:.2f}".format(MaxBudgetPerTargetAudienceBdropM2)) 
    print('\nIn order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:')
    print("MinTargetAudiencePdropM: {}".format(MinTargetAudiencePdropM2))
    print("MaxBudgetPerTargetAudiencePdropM: {0:.2f}".format(MaxBudgetPerTargetAudiencePdropM2)+"\n")

    log_entryUdropM2 = pd.DataFrame([[namedropM2, accdropM2,acc2dropM2]], columns=log_colsUdropM2)#FPR*100,
    logUdropM2 = logUdropM2.append(log_entryUdropM2,ignore_index=True)
    
    log_entrydropM2 = pd.DataFrame([[namedropM2,lldropM2,OverfittingRoc2,BLoss2,BenefitItemdropM2,BenefitCodropM2, CostItemdropM2, CostCodropM2,ImproveRatiodropM2,balancetradeoffradiodropM2,ProfitratiodropM2]], columns=log_colsdropM2)#FPR*100,
    logdropM2 = logdropM2.append(log_entrydropM2,ignore_index=True)
        
    
    log_entryCBAdropM2 = pd.DataFrame([[namedropM2,MinTargetAudienceBdropM2,MaxBudgetPerTargetAudienceBdropM2,MinTargetAudiencePdropM2,MaxBudgetPerTargetAudiencePdropM2]], columns=log_colsCBAdropM2)
    logCBAdropM2 = logCBAdropM2.append(log_entryCBAdropM2,ignore_index=True)
   
    reportdropM2 = classification_report(y_TestdropM2, test_predictionsdropM2)
    print(reportdropM2)
    confusion_matricesdropM2 = [
    ( "", confusion_matrix(y_TestdropM2, test_predictionsdropM2))
    ]
    draw_confusion_matricesdropM2(confusion_matricesdropM2,class_namesdropM2)
    predictions = clf.predict(OutputdropM2)
    print predictions
    #new_column = df['Classifiers'] 
#    pd.DataFrame(predictions).to_csv('C:\Users\chenp\Desktop\output.csv', index = False,header = False, sep=',', mode='a',encoding ='utf-8')
    with open('C:\Users\chenp\Desktop\output2.4.3.1.3.csv', 'a') as csvfile:#, newline=''
                fwriter = csv.writer(csvfile, delimiter=',',quotechar='/',quoting=csv.QUOTE_MINIMAL)#, 
                fwriter.writerow(predictions)
==============================================================================================================
KNeighborsClassifier
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 12.37%
Model accuracydropM: 90.27% 
ROC just use this to check overfitting: 

0.5310712737183325
0.7849713988559542
----------------Useful Scores: loss and cost-benefit scores

Log LossdropM: 2.16
Brier score loss: 0.10
BenefitItemdropM: 12
BenefitCodropM: 10.08%
CostItemdropM: 75
CostCodropM: 4.29%
ImproveRatio: 234.89%
balancetradeoffradio: 6.25
balancetradeoffradio: 6.25
ProfitratiodropM: 0.89

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropM: 1785
MaxBudgetPerTargetAudienceBdropM: 112.04

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropM: 1964
MaxBudgetPerTargetAudiencePdropM: 101.83

             precision    recall  f1-score   support

          0       0.94      0.96      0.95      1628
          1       0.16      0.10      0.12       119

avg / total       0.88      0.90      0.89      1747

Confusion matrixdropM:
[[1565   63]
 [ 107   12]]
[0 1 1 ... 0 0 0]
==============================================================================================================
AdaBoostClassifier
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 26.53%
Model accuracydropM: 75.90% 
ROC just use this to check overfitting: 

0.703234364999071
0.7758710348413936
----------------Useful Scores: loss and cost-benefit scores

Log LossdropM: 0.68
Brier score loss: 0.24
BenefitItemdropM: 76
BenefitCodropM: 63.87%
CostItemdropM: 454
CostCodropM: 25.99%
ImproveRatio: 245.76%
balancetradeoffradio: 5.97
balancetradeoffradio: 5.97
ProfitratiodropM: 0.15

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropM: 1705
MaxBudgetPerTargetAudienceBdropM: 117.30

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropM: 1876
MaxBudgetPerTargetAudiencePdropM: 106.61

             precision    recall  f1-score   support

          0       0.97      0.77      0.86      1628
          1       0.17      0.64      0.27       119

avg / total       0.91      0.76      0.82      1747

Confusion matrixdropM:
[[1250  378]
 [  43   76]]
[0 1 1 ... 0 0 0]
==============================================================================================================
GradientBoostingClassifier
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=1.0, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 21.80%
Model accuracydropM: 88.09% 
ROC just use this to check overfitting: 

0.585607953255012
0.8965158606344253
----------------Useful Scores: loss and cost-benefit scores

Log LossdropM: 0.36
Brier score loss: 0.12
BenefitItemdropM: 29
BenefitCodropM: 24.37%
CostItemdropM: 147
CostCodropM: 8.41%
ImproveRatio: 289.62%
balancetradeoffradio: 5.07
balancetradeoffradio: 5.07
ProfitratiodropM: 0.46

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropM: 1448
MaxBudgetPerTargetAudienceBdropM: 138.12

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropM: 1593
MaxBudgetPerTargetAudiencePdropM: 125.55

             precision    recall  f1-score   support

          0       0.94      0.93      0.94      1628
          1       0.20      0.24      0.22       119

avg / total       0.89      0.88      0.89      1747

Confusion matrixdropM:
[[1510  118]
 [  90   29]]
[0 0 0 ... 0 0 0]
==============================================================================================================
LGBMClassifier
LGBMClassifier(boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.631179404427, importance_type='split',
        learning_rate=0.0278025184912, max_depth=-1, metric='auc',
        min_child_samples=250, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=138, n_jobs=-1, num_leaves=40, objective='binary',
        random_state=50, reg_alpha=0.0618311835591,
        reg_lambda=0.247428314075, silent=True, subsample=0.999742610272,
        subsample_for_bin=280000, subsample_freq=1, verbose=1)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 24.76%
Model accuracydropM: 77.73% 
ROC just use this to check overfitting: 

c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
0.6663277104453574
0.8095423816952678
----------------Useful Scores: loss and cost-benefit scores

Log LossdropM: 0.46
Brier score loss: 0.22
BenefitItemdropM: 64
BenefitCodropM: 53.78%
CostItemdropM: 398
CostCodropM: 22.78%
ImproveRatio: 236.07%
balancetradeoffradio: 6.22
balancetradeoffradio: 6.22
ProfitratiodropM: 0.17

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropM: 1777
MaxBudgetPerTargetAudienceBdropM: 112.55

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropM: 1954
MaxBudgetPerTargetAudiencePdropM: 102.35

             precision    recall  f1-score   support

          0       0.96      0.79      0.87      1628
          1       0.16      0.54      0.25       119

avg / total       0.90      0.78      0.83      1747

Confusion matrixdropM:
[[1294  334]
 [  55   64]]
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
[0 1 1 ... 1 1 0]
==============================================================================================================
DecisionTreeClassifier
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=15,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=1e-07, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 21.53%
Model accuracydropM: 81.22% 
ROC just use this to check overfitting: 

0.6110658022422728
0.874544981799272
----------------Useful Scores: loss and cost-benefit scores

Log LossdropM: 1.06
Brier score loss: 0.19
BenefitItemdropM: 45
BenefitCodropM: 37.82%
CostItemdropM: 299
CostCodropM: 17.12%
ImproveRatio: 220.95%
balancetradeoffradio: 6.64
balancetradeoffradio: 6.64
ProfitratiodropM: 0.22

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropM: 1897
MaxBudgetPerTargetAudienceBdropM: 105.43

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropM: 2086
MaxBudgetPerTargetAudiencePdropM: 95.88

             precision    recall  f1-score   support

          0       0.95      0.84      0.89      1628
          1       0.15      0.38      0.22       119

avg / total       0.89      0.81      0.85      1747

Confusion matrixdropM:
[[1374  254]
 [  74   45]]
[0 0 0 ... 0 1 0]
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    4.0s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
==============================================================================================================
RandomForestClassifier
RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=15, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=1e-07,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=42,
            verbose=1, warm_start=False)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 27.76%
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
Model accuracydropM: 85.40% 
ROC just use this to check overfitting: 

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.5s finished
0.6490641711229946
0.874154966198648
----------------Useful Scores: loss and cost-benefit scores

Log LossdropM: 0.39
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
Brier score loss: 0.15
BenefitItemdropM: 49
BenefitCodropM: 41.18%
CostItemdropM: 234
CostCodropM: 13.39%
ImproveRatio: 307.42%
balancetradeoffradio: 4.78
balancetradeoffradio: 4.78
ProfitratiodropM: 0.29

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropM: 1365
MaxBudgetPerTargetAudienceBdropM: 146.52

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropM: 1502
MaxBudgetPerTargetAudiencePdropM: 133.16

             precision    recall  f1-score   support

          0       0.95      0.89      0.92      1628
          1       0.21      0.41      0.28       119

avg / total       0.90      0.85      0.88      1747

Confusion matrixdropM:
[[1443  185]
 [  70   49]]
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.3s finished
[0 1 1 ... 0 0 0]
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    2.0s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
==============================================================================================================
ExtraTreesClassifier
ExtraTreesClassifier(bootstrap=True, class_weight='balanced_subsample',
           criterion='gini', max_depth=15, max_features='auto',
           max_leaf_nodes=None, min_impurity_decrease=1e-07,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=1, oob_score=False, random_state=42,
           verbose=1, warm_start=False)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 29.03%
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
Model accuracydropM: 82.37% 
ROC just use this to check overfitting: 

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.4s finished
0.6873103049573638
0.8195527821112844
----------------Useful Scores: loss and cost-benefit scores

Log LossdropM: 0.47
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
Brier score loss: 0.18
BenefitItemdropM: 63
BenefitCodropM: 52.94%
CostItemdropM: 315
CostCodropM: 18.03%
ImproveRatio: 293.61%
balancetradeoffradio: 5.00
balancetradeoffradio: 5.00
ProfitratiodropM: 0.21

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropM: 1428
MaxBudgetPerTargetAudienceBdropM: 140.06

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropM: 1571
MaxBudgetPerTargetAudiencePdropM: 127.31

             precision    recall  f1-score   support

          0       0.96      0.85      0.90      1628
          1       0.20      0.53      0.29       119

avg / total       0.91      0.82      0.86      1747

Confusion matrixdropM:
[[1376  252]
 [  56   63]]
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished
[0 1 1 ... 0 0 0]
==============================================================================================================
LinearDiscriminantAnalysis
LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 24.15%
Model accuracydropM: 68.00% 
ROC just use this to check overfitting: 

0.7114802923626453
0.6783671346853875
----------------Useful Scores: loss and cost-benefit scores

Log LossdropM: 0.58
Brier score loss: 0.32
BenefitItemdropM: 89
BenefitCodropM: 74.79%
CostItemdropM: 618
CostCodropM: 35.37%
ImproveRatio: 211.42%
balancetradeoffradio: 6.94
balancetradeoffradio: 6.94
ProfitratiodropM: 0.11

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropM: 1982
MaxBudgetPerTargetAudienceBdropM: 100.91

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropM: 2181
MaxBudgetPerTargetAudiencePdropM: 91.70

             precision    recall  f1-score   support

          0       0.97      0.68      0.80      1628
          1       0.14      0.75      0.24       119

avg / total       0.92      0.68      0.76      1747

Confusion matrixdropM:
[[1099  529]
 [  30   89]]
[0 1 1 ... 0 0 1]
==============================================================================================================
BernoulliNB
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 25.20%
Model accuracydropM: 72.81% 
ROC just use this to check overfitting: 

0.7022278198748787
0.6964378575143004
----------------Useful Scores: loss and cost-benefit scores

Log LossdropM: 0.60
Brier score loss: 0.27
BenefitItemdropM: 80
BenefitCodropM: 67.23%
CostItemdropM: 516
CostCodropM: 29.54%
ImproveRatio: 227.61%
balancetradeoffradio: 6.45
balancetradeoffradio: 6.45
ProfitratiodropM: 0.13

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropM: 1842
MaxBudgetPerTargetAudienceBdropM: 108.58

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropM: 2027
MaxBudgetPerTargetAudiencePdropM: 98.67

             precision    recall  f1-score   support

          0       0.97      0.73      0.83      1628
          1       0.16      0.67      0.25       119

avg / total       0.91      0.73      0.79      1747

Confusion matrixdropM:
[[1192  436]
 [  39   80]]
[0 1 1 ... 1 0 0]
[LibLinear]==============================================================================================================
LogisticRegression
LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=1000,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=2, warm_start=False)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 24.01%
Model accuracydropM: 68.12% 
ROC just use this to check overfitting: 

0.7081999876117523
0.6777171086843474
----------------Useful Scores: loss and cost-benefit scores

Log LossdropM: 0.57
Brier score loss: 0.32
BenefitItemdropM: 88
BenefitCodropM: 73.95%
CostItemdropM: 614
CostCodropM: 35.15%
ImproveRatio: 210.41%
balancetradeoffradio: 6.98
balancetradeoffradio: 6.98
ProfitratiodropM: 0.11

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropM: 1994
MaxBudgetPerTargetAudienceBdropM: 100.30

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropM: 2193
MaxBudgetPerTargetAudiencePdropM: 91.20

             precision    recall  f1-score   support

          0       0.97      0.68      0.80      1628
          1       0.14      0.74      0.24       119

avg / total       0.92      0.68      0.76      1747

Confusion matrixdropM:
[[1102  526]
 [  31   88]]
[0 1 1 ... 0 0 1]
[LibSVM]==============================================================================================================
SVC
SVC(C=10, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=42, shrinking=True,
  tol=0.001, verbose=True)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 22.85%
Model accuracydropM: 76.42% 
ROC just use this to check overfitting: 

0.6475801622860446
0.7901716068642746
----------------Useful Scores: loss and cost-benefit scores

Log LossdropM: 0.50
Brier score loss: 0.24
BenefitItemdropM: 61
BenefitCodropM: 51.26%
CostItemdropM: 415
CostCodropM: 23.76%
ImproveRatio: 215.79%
balancetradeoffradio: 6.80
balancetradeoffradio: 6.80
ProfitratiodropM: 0.16

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropM: 1942
MaxBudgetPerTargetAudienceBdropM: 102.99

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropM: 2137
MaxBudgetPerTargetAudiencePdropM: 93.59

             precision    recall  f1-score   support

          0       0.96      0.78      0.86      1628
          1       0.15      0.51      0.23       119

avg / total       0.90      0.76      0.82      1747

Confusion matrixdropM:
[[1274  354]
 [  58   61]]
[1 0 1 ... 0 0 0]

4.3.2 Train, test and evaluate the models using KFold: Smote X_TraindropM (here with the name X_Train) ; y_TestdropM (here with the name y_Train)

In [103]:
from sklearn import metrics
from sklearn.metrics import roc_curve, roc_auc_score
from __future__ import division
from sklearn.model_selection import StratifiedKFold
#Confusion matrix and confusion tables:¶
#The columns represent the actual class and the rows represent the predicted class. Lets evaluate performance:
def draw_confusion_matricesdropMSS(confusion_matriciesdropMSS,class_namesdropMSS):
    class_namesdropMSS = class_namesdropMSS.tolist()
    for cm in confusion_matricesdropMSS:
        classifier, cm = cm[0], cm[1]
        print'Confusion matrixdropM:\n', cm              
        fig = plt.figure()
        ax = fig.add_subplot(111)
        sns.heatmap(cm, annot=True, ax = ax,cmap='Blues', fmt='g'); #annot=True to annotate cells 
        plt.ylabel('True')
        plt.xlabel('Predicted')    
        ax.xaxis.set_ticklabels(['Not Caravan', 'Caraven'],horizontalalignment="center"); ax.yaxis.set_ticklabels(['Not Caravan', 'Caraven'],rotation=45);
        plt.show()   


# Logging for Visual Comparison
log_colsUdropMSS = ["Classifier", "F-score","Accuracy"]
logUdropMSS = pd.DataFrame(columns=log_colsUdropMSS)

log_colsdropMSS=["Classifier", "Log Loss","OverfittingRoc","BLossS","BI","BO", "CI", "CO", "IR","BTOR","PR"] 
logdropMSS = pd.DataFrame(columns=log_colsdropMSS)

log_colsCBAdropMSS =["Classifier", "MTAB","MBPTAB","MTAP","MBPTAP"]
logCBAdropMSS = pd.DataFrame(columns=log_colsCBAdropMSS)


for clf in classifiers:
    skf = StratifiedKFold(n_splits=3,random_state=43,shuffle=True)
    print skf
    skf.get_n_splits(X, y)
    for train_index, test_index in skf.split(X,y):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_Train, X_Test = X.iloc[train_index], X.iloc[test_index]
        y_Train, y_Test = y.iloc[train_index], y.iloc[test_index]
        class_namesdropMSS = np.unique(np.array(y_Test))
        
        X_Train=X_Train.drop(["C1MOSTYPE", "C2MAANTHUI", "C3MGEMOMV","C4MGEMLEEF", "C5MOSHOOFD", 
                                       "C6MGODRK","C7MGODPR", "C8MGODOV", "C9MGODGE","C10MRELGE", "C11MRELSA",
                                       "C12MRELOV","C13MFALLEEN", "C14MFGEKIND", "C15MFWEKIND","C16MOPLHOOG", "C17MOPLMIDD",
                                       "C18MOPLLAAG","C19MBERHOOG", "C20MBERZELF", "C21MBERBOER","C22MBERMIDD", 
                                       "C23MBERARBG", "C24MBERARBO","C25MSKA", "C26MSKB1", "C27MSKB2","C28MSKC", 
                                       "C29MSKD", "C30MHHUUR","C31MHKOOP", "C32MAUT1", "C33MAUT2","C34MAUT0", "C35MZFONDS",
                                       "C36MZPART","C37MINKMthirty", "C38MINK3045", "C39MINK4575","C40MINK7512", "C41MINK123M",
                                       "C42MINKGEM", "C65AWAPART", 
                                       "C66AWABEDR","C67AWALAND","C68APERSAUT", "C69ABESAUT", "C70AMOTSCO","C71AVRAAUT", 
                                       "C72AAANHANG", "C73ATRACTOR","C74AWERKT", "C75ABROM", "C76ALEVEN","C77APERSONG",
                                       "C78AGEZONG", "C79AWAOREG","C80ABRAND","C81AZEILPL", "C82APLEZIER", "C83AFIETS",
                                       "C84AINBOED", "C85ABYSTAND"], axis=1)
        
        X_Test =X_Test.drop(["C1MOSTYPE", "C2MAANTHUI", "C3MGEMOMV","C4MGEMLEEF", "C5MOSHOOFD", 
                                       "C6MGODRK","C7MGODPR", "C8MGODOV", "C9MGODGE","C10MRELGE", "C11MRELSA",
                                       "C12MRELOV","C13MFALLEEN", "C14MFGEKIND", "C15MFWEKIND","C16MOPLHOOG", "C17MOPLMIDD",
                                       "C18MOPLLAAG","C19MBERHOOG", "C20MBERZELF", "C21MBERBOER","C22MBERMIDD", 
                                       "C23MBERARBG", "C24MBERARBO","C25MSKA", "C26MSKB1", "C27MSKB2","C28MSKC", 
                                       "C29MSKD", "C30MHHUUR","C31MHKOOP", "C32MAUT1", "C33MAUT2","C34MAUT0", "C35MZFONDS",
                                       "C36MZPART","C37MINKMthirty", "C38MINK3045", "C39MINK4575","C40MINK7512", "C41MINK123M",
                                       "C42MINKGEM", "C65AWAPART", 
                                       "C66AWABEDR","C67AWALAND","C68APERSAUT", "C69ABESAUT", "C70AMOTSCO","C71AVRAAUT", 
                                       "C72AAANHANG", "C73ATRACTOR","C74AWERKT", "C75ABROM", "C76ALEVEN","C77APERSONG",
                                       "C78AGEZONG", "C79AWAOREG","C80ABRAND","C81AZEILPL", "C82APLEZIER", "C83AFIETS",
                                       "C84AINBOED", "C85ABYSTAND"], axis=1)

            # Apply regular SMOTE
        sm = SMOTE(kind='regular')
        X_trainSS, y_trainSS = sm.fit_sample(X_Train, y_Train)
        print('Training Set Shape after oversampling:   ', X_trainSS.shape, y_trainSS.shape)
        print(pd.crosstab(y_trainSS,y_trainSS))

        #print("TRAIN:", X_Train)
    #cv = cross_validation.StratifiedKFold(y_TraindropM, n_folds=3, random_state=42)
    #test_predictionsdropM = cross_validation.cross_val_predict(clf, X=X_TraindropM, y=y_TraindropM, n_jobs=-1, cv=cv)
        clf.fit(X_trainSS, y_trainSS)
        namedropMSS = clf.__class__.__name__
    
        print("="*110)
        print(namedropMSS)
        print(str(clf));print('\n')
    
        print('****************ResultsdropM****************')
        print('\n----------------Unhelpful Scores\n')

        test_predictionsSS = clf.predict(X_Test)
        
        accdropMSS = f1_score(y_Test, clf.predict(X_Test))
        print("F-scoredropM: {:.2%}".format(accdropMSS))
        
        test_predictionsSS = clf.predict(X_Test)
        acc2dropMSS = accuracy_score(y_Test, test_predictionsSS)

        print('Model accuracySS: {:.2%} '.format(acc2dropMSS))
     
    
        print('ROC just use this to check overfitting: \n')
#If both curves are not too far from each other indicates (Train above test) that there is little overfitting, if the roc of the train is much better that the test and both curves are far from each other, then it´s overfiting. Otherwise underfitting.
        RocScoreSS=roc_auc_score(y_Test, test_predictionsSS)
        fprBSS, tprBSS, thresholdsBSS = roc_curve(y_Test, clf.predict_proba(X_Test)[:,1])
        RocScoreTrainSS=roc_auc_score(y_Train, clf.predict(X_Train))
        fprBTrainSS, tprBTrainSS, thresholdsBTrainSS = roc_curve(y_trainSS, clf.predict_proba(X_trainSS)[:,1])
        
        OverfittingRoc=float(format(RocScoreTrainSS-RocScoreSS,'.2f'))
        
        print RocScoreSS
        print RocScoreTrainSS
        plt.figure()
        plt.plot(fprBSS, tprBSS, label='classifiersTest' % RocScoreSS)
        plt.plot(fprBTrainSS, tprBTrainSS, label='classifiersTrain' % RocScoreTrainSS)
        plt.plot([0, 1], [0, 1],'r--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic')
        plt.legend(loc="lower right")
        plt.savefig('Log_ROC')
        plt.show()
    
        print('\n----------------Useful Scores: loss and cost-benefit scores\n')
        test_predictionsSS = clf.predict_proba(X_Test)
        lldropMSS = log_loss(y_Test, test_predictionsSS)
        print("Log LossdropMSS: {:.2f}".format(lldropMSS))
        
        test_predictionsSS = clf.predict(X_Test)
    
    # This function returns a score of the mean square difference between the actual outcome and the predicted probability of the possible outcome. The actual outcome has to be 1 or 0 (true or false), while the predicted probability of the actual outcome can be a value between 0 and 1.The brier score loss is also between 0 to 1 and the lower the score (the mean square difference is smaller), the more accurate the prediction is. It can be thought of as a measure of the “calibration” of a set of probabilistic predictions.
        BLossS= brier_score_loss(y_Test, test_predictionsSS)
        print("Brier score loss: {:.2f}".format(BLossS))
    
#    test_predictionsdropM = clf.predict(X_TestdropM)
        confusiondropMSS = metrics.confusion_matrix(y_Test, test_predictionsSS)
        TPdropMSS = confusiondropMSS[0, 0]
        TNdropMSS = confusiondropMSS[1, 1]
        FPdropMSS = confusiondropMSS[1, 0]
        FNdropMSS = confusiondropMSS[0, 1]
        BenefitItemdropMSS = TNdropMSS
        BenefitCodropMSS = TNdropMSS / (TNdropMSS + FPdropMSS) # this is specificity in statistics 
        print("BenefitItemdropMSS: {}".format(BenefitItemdropMSS))
        print("BenefitCodropMSS: {:.2%}".format(BenefitCodropMSS))
    
        CostItemdropMSS = (TNdropMSS + FNdropMSS)
        CostCodropMSS = (TNdropMSS + FNdropMSS) / (TPdropMSS + TNdropMSS + FPdropMSS +FNdropMSS) 
          
        print("CostItemdropMSS: {}".format(CostItemdropMSS))
        print("CostCodropMSS: {:.2%}".format(CostCodropMSS))
        
        ImproveRatiodropMSS = (BenefitItemdropMSS/CostItemdropMSS) /((TNdropMSS+FPdropMSS) /(TPdropMSS + TNdropMSS + FPdropMSS+FNdropMSS))    
        print("ImproveRatio: {:.2%}".format(ImproveRatiodropMSS))
    
        #scenario BenefitItem*price-CostItem*
        balancetradeoffradiodropMSS = float(format(CostItemdropMSS/BenefitItemdropMSS , '.2f'))
        print(CBLUE+"balancetradeoffradio: {0:.2f}".format(balancetradeoffradiodropMSS)+CEND)
        ProfitratiodropMSS = float(format(BenefitItemdropMSS/CostItemdropMSS, '.2f'))
        print(CBLUE+"balancetradeoffradio: {0:.2f}".format(balancetradeoffradiodropMSS)+CEND)
        print(CBLUE+"ProfitratiodropM: {0:.2f}".format(ProfitratiodropMSS)+CEND)
    
        print('\n----------------Scenario analysis shreshold: marketing and controlling strategies\n')
        #   #Scenario: give a budget and the revenue, then I can predict, how much you can spend on each costItem or how many costItem can be spent 
        Budget = 200000
        #One scenario considers benefitgoal, one scenario considers the balance, no deficit
        ProfitGoal= 20000
        # Proft maybe 700 every year, the profit of every Customer is the Reveneue from each customer minus the cost of the customer management 
        ProfitPerBenefitItem = 700

        print ("Business Application of the Model: \n\nLet´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. \nIn summary, the given KPIs of the insurance company are: " +CBEIGE+ "\n\nBudget = 200000 \nRevenuePerBenefitItem = 700 \nProfitGoal = 20000\n"+CEND+'\nThen to keep the account balance not to be deficit, then the insurance company should be:\n\n')
        # within this budget, the smallest amount of  target audience should be reached, in order to keep the balance of the account 
        # Min CostItem
        MinTargetAudienceBdropMSS= int((Budget/ProfitPerBenefitItem)*balancetradeoffradiodropMSS)
    
        #For every Target Audience, how much the company can spend for max. so that the company can keep the balance and avoid deficit
        MaxBudgetPerTargetAudienceBdropMSS = float(format(Budget/MinTargetAudienceBdropMSS, '.2f'))
    
        #To reach the benifit goal, at least MinTargetAudienceP should be reached 
        MinTargetAudiencePdropMSS = int(((Budget + ProfitGoal)/ProfitPerBenefitItem)*balancetradeoffradiodropMSS)
    
        #To reach the benifit goal, at most MaxBudgetPerTargetAudienceP can be spent 
        MaxBudgetPerTargetAudiencePdropMSS = float(format(Budget/MinTargetAudiencePdropMSS, '.2f'))
    
    
        print("MinTargetAudienceBdropMSS: {}".format(MinTargetAudienceBdropMSS))
        print("MaxBudgetPerTargetAudienceBdropMSS: {0:.2f}".format(MaxBudgetPerTargetAudienceBdropMSS)) 
        print('\nIn order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:')
        print("MinTargetAudiencePdropMSS: {}".format(MinTargetAudiencePdropMSS))
        print("MaxBudgetPerTargetAudiencePdropMSS: {0:.2f}".format(MaxBudgetPerTargetAudiencePdropMSS)+"\n")

        log_entryUdropMSS = pd.DataFrame([[namedropMSS, accdropMSS,acc2dropMSS]], columns=log_colsUdropMSS)#FPR*100,
        logUdropMSS = logUdropMSS.append(log_entryUdropMSS,ignore_index=True)
    
        log_entrydropMSS = pd.DataFrame([[namedropMSS,lldropMSS,OverfittingRoc,BLossS,BenefitItemdropMSS,BenefitCodropMSS, CostItemdropMSS, CostCodropMSS,ImproveRatiodropMSS,balancetradeoffradiodropMSS,ProfitratiodropMSS]], columns=log_colsdropMSS)#FPR*100,
        logdropMSS = logdropMSS.append(log_entrydropMSS,ignore_index=True)
        
    
        log_entryCBAdropMSS = pd.DataFrame([[namedropMSS,MinTargetAudienceBdropMSS,MaxBudgetPerTargetAudienceBdropMSS,MinTargetAudiencePdropMSS,MaxBudgetPerTargetAudiencePdropMSS]], columns=log_colsCBAdropMSS)
        logCBAdropMSS = logCBAdropMSS.append(log_entryCBAdropMSS,ignore_index=True)
   
        reportdropMSS = classification_report(y_Test, test_predictionsSS)
        print(reportdropMSS)
        confusion_matricesdropMSS = [
        ( "", confusion_matrix(y_Test, test_predictionsSS))
        ]
        draw_confusion_matricesdropMSS(confusion_matricesdropMSS,class_namesdropMSS)
        predictions = clf.predict(OutputdropM2)
        print predictions
    #new_column = df['Classifiers'] 
#    pd.DataFrame(predictions).to_csv('C:\Users\chenp\Desktop\output.csv', index = False,header = False, sep=',', mode='a',encoding ='utf-8')
        with open('C:\Users\chenp\Desktop\output.KFold.4.3.2.csv', 'a') as csvfile:#, newline=''
                    fwriter = csv.writer(csvfile, delimiter=',',quotechar='/',quoting=csv.QUOTE_MINIMAL)#, 
                    fwriter.writerow(predictions)
StratifiedKFold(n_splits=3, random_state=43, shuffle=True)
('TRAIN:', array([   1,    2,    3, ..., 5817, 5819, 5820]), 'TEST:', array([   0,    5,   10, ..., 5816, 5818, 5821]))
('Training Set Shape after oversampling:   ', (7298L, 22L), (7298L,))
col_0     0     1
row_0            
0      3649     0
1         0  3649
==============================================================================================================
KNeighborsClassifier
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 21.88%
Model accuracySS: 89.70% 
ROC just use this to check overfitting: 

0.5900047236655644
0.69369324141711
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 1.95
Brier score loss: 0.10
BenefitItemdropMSS: 28
BenefitCodropMSS: 24.14%
CostItemdropMSS: 140
CostCodropMSS: 7.21%
ImproveRatio: 334.66%
balancetradeoffradio: 5.00
balancetradeoffradio: 5.00
ProfitratiodropM: 0.20

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 1428
MaxBudgetPerTargetAudienceBdropMSS: 140.06

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 1571
MaxBudgetPerTargetAudiencePdropMSS: 127.31

             precision    recall  f1-score   support

          0       0.95      0.94      0.94      1825
          1       0.20      0.24      0.22       116

avg / total       0.91      0.90      0.90      1941

Confusion matrixdropM:
[[1713  112]
 [  88   28]]
[0 0 1 ... 0 0 0]
('TRAIN:', array([   0,    1,    4, ..., 5819, 5820, 5821]), 'TEST:', array([   2,    3,    7, ..., 5792, 5802, 5813]))
('Training Set Shape after oversampling:   ', (7298L, 22L), (7298L,))
col_0     0     1
row_0            
0      3649     0
1         0  3649
==============================================================================================================
KNeighborsClassifier
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 13.28%
Model accuracySS: 88.56% 
ROC just use this to check overfitting: 

0.5395772319319793
0.6721161206187808
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 2.28
Brier score loss: 0.11
BenefitItemdropMSS: 17
BenefitCodropMSS: 14.66%
CostItemdropMSS: 140
CostCodropMSS: 7.21%
ImproveRatio: 203.18%
balancetradeoffradio: 8.24
balancetradeoffradio: 8.24
ProfitratiodropM: 0.12

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 2354
MaxBudgetPerTargetAudienceBdropMSS: 84.96

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2589
MaxBudgetPerTargetAudiencePdropMSS: 77.25

             precision    recall  f1-score   support

          0       0.95      0.93      0.94      1825
          1       0.12      0.15      0.13       116

avg / total       0.90      0.89      0.89      1941

Confusion matrixdropM:
[[1702  123]
 [  99   17]]
[0 0 1 ... 0 0 0]
('TRAIN:', array([   0,    2,    3, ..., 5816, 5818, 5821]), 'TEST:', array([   1,    4,    6, ..., 5817, 5819, 5820]))
('Training Set Shape after oversampling:   ', (7300L, 22L), (7300L,))
col_0     0     1
row_0            
0      3650     0
1         0  3650
==============================================================================================================
KNeighborsClassifier
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 10.10%
Model accuracySS: 90.82% 
ROC just use this to check overfitting: 

0.5233666061705989
0.6431176192725555
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 1.88
Brier score loss: 0.09
BenefitItemdropMSS: 10
BenefitCodropMSS: 8.62%
CostItemdropMSS: 82
CostCodropMSS: 4.23%
ImproveRatio: 203.95%
balancetradeoffradio: 8.20
balancetradeoffradio: 8.20
ProfitratiodropM: 0.12

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 2342
MaxBudgetPerTargetAudienceBdropMSS: 85.40

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2577
MaxBudgetPerTargetAudiencePdropMSS: 77.61

             precision    recall  f1-score   support

          0       0.94      0.96      0.95      1824
          1       0.12      0.09      0.10       116

avg / total       0.89      0.91      0.90      1940

Confusion matrixdropM:
[[1752   72]
 [ 106   10]]
[0 0 1 ... 0 0 0]
StratifiedKFold(n_splits=3, random_state=43, shuffle=True)
('TRAIN:', array([   1,    2,    3, ..., 5817, 5819, 5820]), 'TEST:', array([   0,    5,   10, ..., 5816, 5818, 5821]))
('Training Set Shape after oversampling:   ', (7298L, 22L), (7298L,))
col_0     0     1
row_0            
0      3649     0
1         0  3649
==============================================================================================================
AdaBoostClassifier
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 21.03%
Model accuracySS: 77.95% 
ROC just use this to check overfitting: 

0.6445937647614549
0.68911416448531
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 0.68
Brier score loss: 0.22
BenefitItemdropMSS: 57
BenefitCodropMSS: 49.14%
CostItemdropMSS: 426
CostCodropMSS: 21.95%
ImproveRatio: 223.89%
balancetradeoffradio: 7.47
balancetradeoffradio: 7.47
ProfitratiodropM: 0.13

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 2134
MaxBudgetPerTargetAudienceBdropMSS: 93.72

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2347
MaxBudgetPerTargetAudiencePdropMSS: 85.22

             precision    recall  f1-score   support

          0       0.96      0.80      0.87      1825
          1       0.13      0.49      0.21       116

avg / total       0.91      0.78      0.83      1941

Confusion matrixdropM:
[[1456  369]
 [  59   57]]
[1 0 0 ... 1 0 0]
('TRAIN:', array([   0,    1,    4, ..., 5819, 5820, 5821]), 'TEST:', array([   2,    3,    7, ..., 5792, 5802, 5813]))
('Training Set Shape after oversampling:   ', (7298L, 22L), (7298L,))
col_0     0     1
row_0            
0      3649     0
1         0  3649
==============================================================================================================
AdaBoostClassifier
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 21.02%
Model accuracySS: 71.35% 
ROC just use this to check overfitting: 

0.6781435994331602
0.6801603651449145
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 0.68
Brier score loss: 0.29
BenefitItemdropMSS: 74
BenefitCodropMSS: 63.79%
CostItemdropMSS: 588
CostCodropMSS: 30.29%
ImproveRatio: 210.58%
balancetradeoffradio: 7.95
balancetradeoffradio: 7.95
ProfitratiodropM: 0.13

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 2271
MaxBudgetPerTargetAudienceBdropMSS: 88.07

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2498
MaxBudgetPerTargetAudiencePdropMSS: 80.06

             precision    recall  f1-score   support

          0       0.97      0.72      0.83      1825
          1       0.13      0.64      0.21       116

avg / total       0.92      0.71      0.79      1941

Confusion matrixdropM:
[[1311  514]
 [  42   74]]
[0 0 1 ... 0 1 0]
('TRAIN:', array([   0,    2,    3, ..., 5816, 5818, 5821]), 'TEST:', array([   1,    4,    6, ..., 5817, 5819, 5820]))
('Training Set Shape after oversampling:   ', (7300L, 22L), (7300L,))
col_0     0     1
row_0            
0      3650     0
1         0  3650
==============================================================================================================
AdaBoostClassifier
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 19.86%
Model accuracySS: 69.64% 
ROC just use this to check overfitting: 

0.6649841197822142
0.7048181388757676
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 0.68
Brier score loss: 0.30
BenefitItemdropMSS: 73
BenefitCodropMSS: 62.93%
CostItemdropMSS: 619
CostCodropMSS: 31.91%
ImproveRatio: 197.23%
balancetradeoffradio: 8.48
balancetradeoffradio: 8.48
ProfitratiodropM: 0.12

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 2422
MaxBudgetPerTargetAudienceBdropMSS: 82.58

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2665
MaxBudgetPerTargetAudiencePdropMSS: 75.05

             precision    recall  f1-score   support

          0       0.97      0.70      0.81      1824
          1       0.12      0.63      0.20       116

avg / total       0.92      0.70      0.78      1940

Confusion matrixdropM:
[[1278  546]
 [  43   73]]
[0 0 1 ... 0 1 0]
StratifiedKFold(n_splits=3, random_state=43, shuffle=True)
('TRAIN:', array([   1,    2,    3, ..., 5817, 5819, 5820]), 'TEST:', array([   0,    5,   10, ..., 5816, 5818, 5821]))
('Training Set Shape after oversampling:   ', (7298L, 22L), (7298L,))
col_0     0     1
row_0            
0      3649     0
1         0  3649
==============================================================================================================
GradientBoostingClassifier
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=1.0, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 20.16%
Model accuracySS: 84.08% 
ROC just use this to check overfitting: 

0.6045418044402455
0.750676850530613
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 0.35
Brier score loss: 0.16
BenefitItemdropMSS: 39
BenefitCodropMSS: 33.62%
CostItemdropMSS: 271
CostCodropMSS: 13.96%
ImproveRatio: 240.80%
balancetradeoffradio: 6.95
balancetradeoffradio: 6.95
ProfitratiodropM: 0.14

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 1985
MaxBudgetPerTargetAudienceBdropMSS: 100.76

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2184
MaxBudgetPerTargetAudiencePdropMSS: 91.58

             precision    recall  f1-score   support

          0       0.95      0.87      0.91      1825
          1       0.14      0.34      0.20       116

avg / total       0.91      0.84      0.87      1941

Confusion matrixdropM:
[[1593  232]
 [  77   39]]
[0 0 1 ... 0 0 0]
('TRAIN:', array([   0,    1,    4, ..., 5819, 5820, 5821]), 'TEST:', array([   2,    3,    7, ..., 5792, 5802, 5813]))
('Training Set Shape after oversampling:   ', (7298L, 22L), (7298L,))
col_0     0     1
row_0            
0      3649     0
1         0  3649
==============================================================================================================
GradientBoostingClassifier
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=1.0, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 23.24%
Model accuracySS: 85.37% 
ROC just use this to check overfitting: 

0.6275366084081246
0.725188644030958
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 0.38
Brier score loss: 0.15
BenefitItemdropMSS: 43
BenefitCodropMSS: 37.07%
CostItemdropMSS: 254
CostCodropMSS: 13.09%
ImproveRatio: 283.27%
balancetradeoffradio: 5.91
balancetradeoffradio: 5.91
ProfitratiodropM: 0.17

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 1688
MaxBudgetPerTargetAudienceBdropMSS: 118.48

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 1857
MaxBudgetPerTargetAudiencePdropMSS: 107.70

             precision    recall  f1-score   support

          0       0.96      0.88      0.92      1825
          1       0.17      0.37      0.23       116

avg / total       0.91      0.85      0.88      1941

Confusion matrixdropM:
[[1614  211]
 [  73   43]]
[0 0 0 ... 1 0 0]
('TRAIN:', array([   0,    2,    3, ..., 5816, 5818, 5821]), 'TEST:', array([   1,    4,    6, ..., 5817, 5819, 5820]))
('Training Set Shape after oversampling:   ', (7300L, 22L), (7300L,))
col_0     0     1
row_0            
0      3650     0
1         0  3650
==============================================================================================================
GradientBoostingClassifier
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=1.0, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 18.29%
Model accuracySS: 85.26% 
ROC just use this to check overfitting: 

0.5825582274652148
0.7400625885687293
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 0.47
Brier score loss: 0.15
BenefitItemdropMSS: 32
BenefitCodropMSS: 27.59%
CostItemdropMSS: 234
CostCodropMSS: 12.06%
ImproveRatio: 228.71%
balancetradeoffradio: 7.31
balancetradeoffradio: 7.31
ProfitratiodropM: 0.14

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 2088
MaxBudgetPerTargetAudienceBdropMSS: 95.79

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2297
MaxBudgetPerTargetAudiencePdropMSS: 87.07

             precision    recall  f1-score   support

          0       0.95      0.89      0.92      1824
          1       0.14      0.28      0.18       116

avg / total       0.90      0.85      0.87      1940

Confusion matrixdropM:
[[1622  202]
 [  84   32]]
[0 0 0 ... 0 0 0]
StratifiedKFold(n_splits=3, random_state=43, shuffle=True)
('TRAIN:', array([   1,    2,    3, ..., 5817, 5819, 5820]), 'TEST:', array([   0,    5,   10, ..., 5816, 5818, 5821]))
('Training Set Shape after oversampling:   ', (7298L, 22L), (7298L,))
col_0     0     1
row_0            
0      3649     0
1         0  3649
==============================================================================================================
LGBMClassifier
LGBMClassifier(boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.631179404427, importance_type='split',
        learning_rate=0.0278025184912, max_depth=-1, metric='auc',
        min_child_samples=250, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=138, n_jobs=-1, num_leaves=40, objective='binary',
        random_state=50, reg_alpha=0.0618311835591,
        reg_lambda=0.247428314075, silent=True, subsample=0.999742610272,
        subsample_for_bin=280000, subsample_freq=1, verbose=1)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 21.00%
Model accuracySS: 76.35% 
ROC just use this to check overfitting: 

0.6522461029759093
0.7073401073510929
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 0.48
Brier score loss: 0.24
BenefitItemdropMSS: 61
BenefitCodropMSS: 52.59%
CostItemdropMSS: 465
CostCodropMSS: 23.96%
ImproveRatio: 219.51%
balancetradeoffradio: 7.62
balancetradeoffradio: 7.62
ProfitratiodropM: 0.13

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 2177
MaxBudgetPerTargetAudienceBdropMSS: 91.87

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2394
MaxBudgetPerTargetAudiencePdropMSS: 83.54

             precision    recall  f1-score   support

          0       0.96      0.78      0.86      1825
          1       0.13      0.53      0.21       116

avg / total       0.91      0.76      0.82      1941

Confusion matrixdropM:
[[1421  404]
 [  55   61]]
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
[1 1 1 ... 1 0 1]
('TRAIN:', array([   0,    1,    4, ..., 5819, 5820, 5821]), 'TEST:', array([   2,    3,    7, ..., 5792, 5802, 5813]))
('Training Set Shape after oversampling:   ', (7298L, 22L), (7298L,))
col_0     0     1
row_0            
0      3649     0
1         0  3649
==============================================================================================================
LGBMClassifier
LGBMClassifier(boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.631179404427, importance_type='split',
        learning_rate=0.0278025184912, max_depth=-1, metric='auc',
        min_child_samples=250, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=138, n_jobs=-1, num_leaves=40, objective='binary',
        random_state=50, reg_alpha=0.0618311835591,
        reg_lambda=0.247428314075, silent=True, subsample=0.999742610272,
        subsample_for_bin=280000, subsample_freq=1, verbose=1)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 21.35%
Model accuracySS: 72.28% 
ROC just use this to check overfitting: 

0.6790387340576287
0.7135841420889993
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 0.50
Brier score loss: 0.28
BenefitItemdropMSS: 73
BenefitCodropMSS: 62.93%
CostItemdropMSS: 568
CostCodropMSS: 29.26%
ImproveRatio: 215.05%
balancetradeoffradio: 7.78
balancetradeoffradio: 7.78
ProfitratiodropM: 0.13

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 2222
MaxBudgetPerTargetAudienceBdropMSS: 90.01

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2445
MaxBudgetPerTargetAudiencePdropMSS: 81.80

             precision    recall  f1-score   support

          0       0.97      0.73      0.83      1825
          1       0.13      0.63      0.21       116

avg / total       0.92      0.72      0.79      1941

Confusion matrixdropM:
[[1330  495]
 [  43   73]]
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
[0 1 1 ... 1 1 0]
('TRAIN:', array([   0,    2,    3, ..., 5816, 5818, 5821]), 'TEST:', array([   1,    4,    6, ..., 5817, 5819, 5820]))
('Training Set Shape after oversampling:   ', (7300L, 22L), (7300L,))
col_0     0     1
row_0            
0      3650     0
1         0  3650
==============================================================================================================
LGBMClassifier
LGBMClassifier(boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.631179404427, importance_type='split',
        learning_rate=0.0278025184912, max_depth=-1, metric='auc',
        min_child_samples=250, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=138, n_jobs=-1, num_leaves=40, objective='binary',
        random_state=50, reg_alpha=0.0618311835591,
        reg_lambda=0.247428314075, silent=True, subsample=0.999742610272,
        subsample_for_bin=280000, subsample_freq=1, verbose=1)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 21.15%
Model accuracySS: 73.09% 
ROC just use this to check overfitting: 

0.6712416817906837
0.7137045347189419
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 0.47
Brier score loss: 0.27
BenefitItemdropMSS: 70
BenefitCodropMSS: 60.34%
CostItemdropMSS: 546
CostCodropMSS: 28.14%
ImproveRatio: 214.41%
balancetradeoffradio: 7.80
balancetradeoffradio: 7.80
ProfitratiodropM: 0.13

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 2228
MaxBudgetPerTargetAudienceBdropMSS: 89.77

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2451
MaxBudgetPerTargetAudiencePdropMSS: 81.60

             precision    recall  f1-score   support

          0       0.97      0.74      0.84      1824
          1       0.13      0.60      0.21       116

avg / total       0.92      0.73      0.80      1940

Confusion matrixdropM:
[[1348  476]
 [  46   70]]
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
c:\users\chenp\anaconda2\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
[0 1 1 ... 1 1 0]
StratifiedKFold(n_splits=3, random_state=43, shuffle=True)
('TRAIN:', array([   1,    2,    3, ..., 5817, 5819, 5820]), 'TEST:', array([   0,    5,   10, ..., 5816, 5818, 5821]))
('Training Set Shape after oversampling:   ', (7298L, 22L), (7298L,))
col_0     0     1
row_0            
0      3649     0
1         0  3649
==============================================================================================================
DecisionTreeClassifier
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=15,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=1e-07, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 20.42%
Model accuracySS: 80.32% 
ROC just use this to check overfitting: 

0.6249055266887105
0.7748084028689957
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 1.12
Brier score loss: 0.20
BenefitItemdropMSS: 49
BenefitCodropMSS: 42.24%
CostItemdropMSS: 364
CostCodropMSS: 18.75%
ImproveRatio: 225.25%
balancetradeoffradio: 7.43
balancetradeoffradio: 7.43
ProfitratiodropM: 0.13

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 2122
MaxBudgetPerTargetAudienceBdropMSS: 94.25

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2335
MaxBudgetPerTargetAudiencePdropMSS: 85.65

             precision    recall  f1-score   support

          0       0.96      0.83      0.89      1825
          1       0.13      0.42      0.20       116

avg / total       0.91      0.80      0.85      1941

Confusion matrixdropM:
[[1510  315]
 [  67   49]]
[0 0 1 ... 0 0 0]
('TRAIN:', array([   0,    1,    4, ..., 5819, 5820, 5821]), 'TEST:', array([   2,    3,    7, ..., 5792, 5802, 5813]))
('Training Set Shape after oversampling:   ', (7298L, 22L), (7298L,))
col_0     0     1
row_0            
0      3649     0
1         0  3649
==============================================================================================================
DecisionTreeClassifier
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=15,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=1e-07, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 18.58%
Model accuracySS: 75.17% 
ROC just use this to check overfitting: 

0.6217264997638167
0.7708364833067161
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 1.15
Brier score loss: 0.25
BenefitItemdropMSS: 55
BenefitCodropMSS: 47.41%
CostItemdropMSS: 476
CostCodropMSS: 24.52%
ImproveRatio: 193.34%
balancetradeoffradio: 8.65
balancetradeoffradio: 8.65
ProfitratiodropM: 0.12

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 2471
MaxBudgetPerTargetAudienceBdropMSS: 80.94

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2718
MaxBudgetPerTargetAudiencePdropMSS: 73.58

             precision    recall  f1-score   support

          0       0.96      0.77      0.85      1825
          1       0.12      0.47      0.19       116

avg / total       0.91      0.75      0.81      1941

Confusion matrixdropM:
[[1404  421]
 [  61   55]]
[0 0 1 ... 1 0 0]
('TRAIN:', array([   0,    2,    3, ..., 5816, 5818, 5821]), 'TEST:', array([   1,    4,    6, ..., 5817, 5819, 5820]))
('Training Set Shape after oversampling:   ', (7300L, 22L), (7300L,))
col_0     0     1
row_0            
0      3650     0
1         0  3650
==============================================================================================================
DecisionTreeClassifier
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=15,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=1e-07, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 19.16%
Model accuracySS: 86.08% 
ROC just use this to check overfitting: 

0.5869441923774955
0.7471492678318374
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 0.93
Brier score loss: 0.14
BenefitItemdropMSS: 32
BenefitCodropMSS: 27.59%
CostItemdropMSS: 218
CostCodropMSS: 11.24%
ImproveRatio: 245.49%
balancetradeoffradio: 6.81
balancetradeoffradio: 6.81
ProfitratiodropM: 0.15

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 1945
MaxBudgetPerTargetAudienceBdropMSS: 102.83

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2140
MaxBudgetPerTargetAudiencePdropMSS: 93.46

             precision    recall  f1-score   support

          0       0.95      0.90      0.92      1824
          1       0.15      0.28      0.19       116

avg / total       0.90      0.86      0.88      1940

Confusion matrixdropM:
[[1638  186]
 [  84   32]]
[0 0 0 ... 0 0 0]
StratifiedKFold(n_splits=3, random_state=43, shuffle=True)
('TRAIN:', array([   1,    2,    3, ..., 5817, 5819, 5820]), 'TEST:', array([   0,    5,   10, ..., 5816, 5818, 5821]))
('Training Set Shape after oversampling:   ', (7298L, 22L), (7298L,))
col_0     0     1
row_0            
0      3649     0
1         0  3649
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    2.6s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
==============================================================================================================
RandomForestClassifier
RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=15, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=1e-07,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=42,
            verbose=1, warm_start=False)


****************ResultsdropM****************

----------------Unhelpful Scores

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
F-scoredropM: 22.06%
Model accuracySS: 83.26% 
ROC just use this to check overfitting: 

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.5s finished
0.6284128483703354
0.7769127819619924
----------------Useful Scores: loss and cost-benefit scores

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
Log LossdropMSS: 0.40
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished
Brier score loss: 0.17
BenefitItemdropMSS: 46
BenefitCodropMSS: 39.66%
CostItemdropMSS: 301
CostCodropMSS: 15.51%
ImproveRatio: 255.72%
balancetradeoffradio: 6.54
balancetradeoffradio: 6.54
ProfitratiodropM: 0.15

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 1868
MaxBudgetPerTargetAudienceBdropMSS: 107.07

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2055
MaxBudgetPerTargetAudiencePdropMSS: 97.32

             precision    recall  f1-score   support

          0       0.96      0.86      0.91      1825
          1       0.15      0.40      0.22       116

avg / total       0.91      0.83      0.87      1941

Confusion matrixdropM:
[[1570  255]
 [  70   46]]
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.3s finished
[0 1 1 ... 1 0 0]
('TRAIN:', array([   0,    1,    4, ..., 5819, 5820, 5821]), 'TEST:', array([   2,    3,    7, ..., 5792, 5802, 5813]))
('Training Set Shape after oversampling:   ', (7298L, 22L), (7298L,))
col_0     0     1
row_0            
0      3649     0
1         0  3649
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    2.5s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
==============================================================================================================
RandomForestClassifier
RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=15, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=1e-07,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=42,
            verbose=1, warm_start=False)


****************ResultsdropM****************

----------------Unhelpful Scores

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
F-scoredropM: 26.11%
Model accuracySS: 83.67% 
ROC just use this to check overfitting: 

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished
0.670968351440718
0.7847993309456535
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.3s finished
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 0.41
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
Brier score loss: 0.16
BenefitItemdropMSS: 56
BenefitCodropMSS: 48.28%
CostItemdropMSS: 313
CostCodropMSS: 16.13%
ImproveRatio: 299.37%
balancetradeoffradio: 5.59
balancetradeoffradio: 5.59
ProfitratiodropM: 0.18

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 1597
MaxBudgetPerTargetAudienceBdropMSS: 125.23

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 1756
MaxBudgetPerTargetAudiencePdropMSS: 113.90

             precision    recall  f1-score   support

          0       0.96      0.86      0.91      1825
          1       0.18      0.48      0.26       116

avg / total       0.92      0.84      0.87      1941

Confusion matrixdropM:
[[1568  257]
 [  60   56]]
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished
[0 0 1 ... 1 0 0]
('TRAIN:', array([   0,    2,    3, ..., 5816, 5818, 5821]), 'TEST:', array([   1,    4,    6, ..., 5817, 5819, 5820]))
('Training Set Shape after oversampling:   ', (7300L, 22L), (7300L,))
col_0     0     1
row_0            
0      3650     0
1         0  3650
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    3.7s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
==============================================================================================================
RandomForestClassifier
RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=15, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=1e-07,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=42,
            verbose=1, warm_start=False)


****************ResultsdropM****************

----------------Unhelpful Scores

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
F-scoredropM: 22.89%
Model accuracySS: 80.21% 
ROC just use this to check overfitting: 

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.4s finished
0.6565997428917121
0.7710829003306566
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.5s finished
----------------Useful Scores: loss and cost-benefit scores

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished
Log LossdropMSS: 0.40
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished
Brier score loss: 0.20
BenefitItemdropMSS: 57
BenefitCodropMSS: 49.14%
CostItemdropMSS: 382
CostCodropMSS: 19.69%
ImproveRatio: 249.55%
balancetradeoffradio: 6.70
balancetradeoffradio: 6.70
ProfitratiodropM: 0.15

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 1914
MaxBudgetPerTargetAudienceBdropMSS: 104.49

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2105
MaxBudgetPerTargetAudiencePdropMSS: 95.01

             precision    recall  f1-score   support

          0       0.96      0.82      0.89      1824
          1       0.15      0.49      0.23       116

avg / total       0.91      0.80      0.85      1940

Confusion matrixdropM:
[[1499  325]
 [  59   57]]
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.3s finished
[0 1 1 ... 1 0 0]
StratifiedKFold(n_splits=3, random_state=43, shuffle=True)
('TRAIN:', array([   1,    2,    3, ..., 5817, 5819, 5820]), 'TEST:', array([   0,    5,   10, ..., 5816, 5818, 5821]))
('Training Set Shape after oversampling:   ', (7298L, 22L), (7298L,))
col_0     0     1
row_0            
0      3649     0
1         0  3649
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    2.8s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
==============================================================================================================
ExtraTreesClassifier
ExtraTreesClassifier(bootstrap=True, class_weight='balanced_subsample',
           criterion='gini', max_depth=15, max_features='auto',
           max_leaf_nodes=None, min_impurity_decrease=1e-07,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=1, oob_score=False, random_state=42,
           verbose=1, warm_start=False)


****************ResultsdropM****************

----------------Unhelpful Scores

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
F-scoredropM: 20.94%
Model accuracySS: 80.16% 
ROC just use this to check overfitting: 

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.4s finished
0.6321563533301843
0.7802403350941685
----------------Useful Scores: loss and cost-benefit scores

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
Log LossdropMSS: 0.48
Brier score loss: 0.20
BenefitItemdropMSS: 51
BenefitCodropMSS: 43.97%
CostItemdropMSS: 371
CostCodropMSS: 19.11%
ImproveRatio: 230.02%
balancetradeoffradio: 7.27
balancetradeoffradio: 7.27
ProfitratiodropM: 0.14

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 2077
MaxBudgetPerTargetAudienceBdropMSS: 96.29

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2284
MaxBudgetPerTargetAudiencePdropMSS: 87.57

             precision    recall  f1-score   support

          0       0.96      0.82      0.89      1825
          1       0.14      0.44      0.21       116

avg / total       0.91      0.80      0.85      1941

Confusion matrixdropM:
[[1505  320]
 [  65   51]]
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
[0 1 1 ... 1 0 0]
('TRAIN:', array([   0,    1,    4, ..., 5819, 5820, 5821]), 'TEST:', array([   2,    3,    7, ..., 5792, 5802, 5813]))
('Training Set Shape after oversampling:   ', (7298L, 22L), (7298L,))
col_0     0     1
row_0            
0      3649     0
1         0  3649
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    2.2s finished
==============================================================================================================
ExtraTreesClassifier
ExtraTreesClassifier(bootstrap=True, class_weight='balanced_subsample',
           criterion='gini', max_depth=15, max_features='auto',
           max_leaf_nodes=None, min_impurity_decrease=1e-07,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=1, oob_score=False, random_state=42,
           verbose=1, warm_start=False)


****************ResultsdropM****************

----------------Unhelpful Scores

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
F-scoredropM: 26.15%
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
Model accuracySS: 79.34% 
ROC just use this to check overfitting: 

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished
0.7085002361832782
0.7712221581727634
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.7s finished
----------------Useful Scores: loss and cost-benefit scores

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
Log LossdropMSS: 0.49
Brier score loss: 0.21
BenefitItemdropMSS: 71
BenefitCodropMSS: 61.21%
CostItemdropMSS: 427
CostCodropMSS: 22.00%
ImproveRatio: 278.23%
balancetradeoffradio: 6.01
balancetradeoffradio: 6.01
ProfitratiodropM: 0.17

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 1717
MaxBudgetPerTargetAudienceBdropMSS: 116.48

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 1888
MaxBudgetPerTargetAudiencePdropMSS: 105.93

             precision    recall  f1-score   support

          0       0.97      0.80      0.88      1825
          1       0.17      0.61      0.26       116

avg / total       0.92      0.79      0.84      1941

Confusion matrixdropM:
[[1469  356]
 [  45   71]]
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.3s finished
[0 1 1 ... 1 0 0]
('TRAIN:', array([   0,    2,    3, ..., 5816, 5818, 5821]), 'TEST:', array([   1,    4,    6, ..., 5817, 5819, 5820]))
('Training Set Shape after oversampling:   ', (7300L, 22L), (7300L,))
col_0     0     1
row_0            
0      3650     0
1         0  3650
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    3.1s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished
==============================================================================================================
ExtraTreesClassifier
ExtraTreesClassifier(bootstrap=True, class_weight='balanced_subsample',
           criterion='gini', max_depth=15, max_features='auto',
           max_leaf_nodes=None, min_impurity_decrease=1e-07,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=1, oob_score=False, random_state=42,
           verbose=1, warm_start=False)


****************ResultsdropM****************

----------------Unhelpful Scores

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished
F-scoredropM: 24.22%
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished
Model accuracySS: 80.00% 
ROC just use this to check overfitting: 

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.6s finished
0.6756843617664852
0.7726535191308456
----------------Useful Scores: loss and cost-benefit scores

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished
Log LossdropMSS: 0.49
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished
Brier score loss: 0.20
BenefitItemdropMSS: 62
BenefitCodropMSS: 53.45%
CostItemdropMSS: 396
CostCodropMSS: 20.41%
ImproveRatio: 261.84%
balancetradeoffradio: 6.39
balancetradeoffradio: 6.39
ProfitratiodropM: 0.16

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 1825
MaxBudgetPerTargetAudienceBdropMSS: 109.59

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2008
MaxBudgetPerTargetAudiencePdropMSS: 99.60

             precision    recall  f1-score   support

          0       0.97      0.82      0.88      1824
          1       0.16      0.53      0.24       116

avg / total       0.92      0.80      0.85      1940

Confusion matrixdropM:
[[1490  334]
 [  54   62]]
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.3s finished
[0 1 1 ... 1 0 0]
StratifiedKFold(n_splits=3, random_state=43, shuffle=True)
('TRAIN:', array([   1,    2,    3, ..., 5817, 5819, 5820]), 'TEST:', array([   0,    5,   10, ..., 5816, 5818, 5821]))
('Training Set Shape after oversampling:   ', (7298L, 22L), (7298L,))
col_0     0     1
row_0            
0      3649     0
1         0  3649
==============================================================================================================
LinearDiscriminantAnalysis
LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 19.92%
Model accuracySS: 67.28% 
ROC just use this to check overfitting: 

0.6766816249409541
0.7057501582861625
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 0.61
Brier score loss: 0.33
BenefitItemdropMSS: 79
BenefitCodropMSS: 68.10%
CostItemdropMSS: 677
CostCodropMSS: 34.88%
ImproveRatio: 195.26%
balancetradeoffradio: 8.57
balancetradeoffradio: 8.57
ProfitratiodropM: 0.12

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 2448
MaxBudgetPerTargetAudienceBdropMSS: 81.70

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2693
MaxBudgetPerTargetAudiencePdropMSS: 74.27

             precision    recall  f1-score   support

          0       0.97      0.67      0.79      1825
          1       0.12      0.68      0.20       116

avg / total       0.92      0.67      0.76      1941

Confusion matrixdropM:
[[1227  598]
 [  37   79]]
[0 1 1 ... 1 0 1]
('TRAIN:', array([   0,    1,    4, ..., 5819, 5820, 5821]), 'TEST:', array([   2,    3,    7, ..., 5792, 5802, 5813]))
('Training Set Shape after oversampling:   ', (7298L, 22L), (7298L,))
col_0     0     1
row_0            
0      3649     0
1         0  3649
==============================================================================================================
LinearDiscriminantAnalysis
LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 18.66%
Model accuracySS: 61.82% 
ROC just use this to check overfitting: 

0.6718587623996222
0.6810858666994264
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 0.61
Brier score loss: 0.38
BenefitItemdropMSS: 85
BenefitCodropMSS: 73.28%
CostItemdropMSS: 795
CostCodropMSS: 40.96%
ImproveRatio: 178.90%
balancetradeoffradio: 9.35
balancetradeoffradio: 9.35
ProfitratiodropM: 0.11

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 2671
MaxBudgetPerTargetAudienceBdropMSS: 74.88

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2938
MaxBudgetPerTargetAudiencePdropMSS: 68.07

             precision    recall  f1-score   support

          0       0.97      0.61      0.75      1825
          1       0.11      0.73      0.19       116

avg / total       0.92      0.62      0.72      1941

Confusion matrixdropM:
[[1115  710]
 [  31   85]]
[0 1 1 ... 0 1 1]
('TRAIN:', array([   0,    2,    3, ..., 5816, 5818, 5821]), 'TEST:', array([   1,    4,    6, ..., 5817, 5819, 5820]))
('Training Set Shape after oversampling:   ', (7300L, 22L), (7300L,))
col_0     0     1
row_0            
0      3650     0
1         0  3650
==============================================================================================================
LinearDiscriminantAnalysis
LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 19.55%
Model accuracySS: 66.91% 
ROC just use this to check overfitting: 

0.6706367211131277
0.6996941426547
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 0.60
Brier score loss: 0.33
BenefitItemdropMSS: 78
BenefitCodropMSS: 67.24%
CostItemdropMSS: 682
CostCodropMSS: 35.15%
ImproveRatio: 191.27%
balancetradeoffradio: 8.74
balancetradeoffradio: 8.74
ProfitratiodropM: 0.11

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 2497
MaxBudgetPerTargetAudienceBdropMSS: 80.10

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2746
MaxBudgetPerTargetAudiencePdropMSS: 72.83

             precision    recall  f1-score   support

          0       0.97      0.67      0.79      1824
          1       0.11      0.67      0.20       116

avg / total       0.92      0.67      0.76      1940

Confusion matrixdropM:
[[1220  604]
 [  38   78]]
[0 1 1 ... 0 0 1]
StratifiedKFold(n_splits=3, random_state=43, shuffle=True)
('TRAIN:', array([   1,    2,    3, ..., 5817, 5819, 5820]), 'TEST:', array([   0,    5,   10, ..., 5816, 5818, 5821]))
('Training Set Shape after oversampling:   ', (7298L, 22L), (7298L,))
col_0     0     1
row_0            
0      3649     0
1         0  3649
==============================================================================================================
BernoulliNB
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 21.33%
Model accuracySS: 71.87% 
ROC just use this to check overfitting: 

0.6808833254605575
0.7012384120354184
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 0.64
Brier score loss: 0.28
BenefitItemdropMSS: 74
BenefitCodropMSS: 63.79%
CostItemdropMSS: 578
CostCodropMSS: 29.78%
ImproveRatio: 214.23%
balancetradeoffradio: 7.81
balancetradeoffradio: 7.81
ProfitratiodropM: 0.13

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 2231
MaxBudgetPerTargetAudienceBdropMSS: 89.65

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2454
MaxBudgetPerTargetAudiencePdropMSS: 81.50

             precision    recall  f1-score   support

          0       0.97      0.72      0.83      1825
          1       0.13      0.64      0.21       116

avg / total       0.92      0.72      0.79      1941

Confusion matrixdropM:
[[1321  504]
 [  42   74]]
[0 1 1 ... 1 0 0]
('TRAIN:', array([   0,    1,    4, ..., 5819, 5820, 5821]), 'TEST:', array([   2,    3,    7, ..., 5792, 5802, 5813]))
('Training Set Shape after oversampling:   ', (7298L, 22L), (7298L,))
col_0     0     1
row_0            
0      3649     0
1         0  3649
==============================================================================================================
BernoulliNB
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 21.07%
Model accuracySS: 70.27% 
ROC just use this to check overfitting: 

0.6844992914501654
0.6967166252445167
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 0.63
Brier score loss: 0.30
BenefitItemdropMSS: 77
BenefitCodropMSS: 66.38%
CostItemdropMSS: 615
CostCodropMSS: 31.68%
ImproveRatio: 209.50%
balancetradeoffradio: 7.99
balancetradeoffradio: 7.99
ProfitratiodropM: 0.13

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 2282
MaxBudgetPerTargetAudienceBdropMSS: 87.64

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2511
MaxBudgetPerTargetAudiencePdropMSS: 79.65

             precision    recall  f1-score   support

          0       0.97      0.71      0.82      1825
          1       0.13      0.66      0.21       116

avg / total       0.92      0.70      0.78      1941

Confusion matrixdropM:
[[1287  538]
 [  39   77]]
[0 1 1 ... 1 0 0]
('TRAIN:', array([   0,    2,    3, ..., 5816, 5818, 5821]), 'TEST:', array([   1,    4,    6, ..., 5817, 5819, 5820]))
('Training Set Shape after oversampling:   ', (7300L, 22L), (7300L,))
col_0     0     1
row_0            
0      3650     0
1         0  3650
==============================================================================================================
BernoulliNB
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 21.60%
Model accuracySS: 73.81% 
ROC just use this to check overfitting: 

0.6750794010889293
0.6986726499763818
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 0.62
Brier score loss: 0.26
BenefitItemdropMSS: 70
BenefitCodropMSS: 60.34%
CostItemdropMSS: 532
CostCodropMSS: 27.42%
ImproveRatio: 220.05%
balancetradeoffradio: 7.60
balancetradeoffradio: 7.60
ProfitratiodropM: 0.13

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 2171
MaxBudgetPerTargetAudienceBdropMSS: 92.12

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2388
MaxBudgetPerTargetAudiencePdropMSS: 83.75

             precision    recall  f1-score   support

          0       0.97      0.75      0.84      1824
          1       0.13      0.60      0.22       116

avg / total       0.92      0.74      0.81      1940

Confusion matrixdropM:
[[1362  462]
 [  46   70]]
[0 1 1 ... 1 0 0]
StratifiedKFold(n_splits=3, random_state=43, shuffle=True)
('TRAIN:', array([   1,    2,    3, ..., 5817, 5819, 5820]), 'TEST:', array([   0,    5,   10, ..., 5816, 5818, 5821]))
('Training Set Shape after oversampling:   ', (7298L, 22L), (7298L,))
col_0     0     1
row_0            
0      3649     0
1         0  3649
[LibLinear]==============================================================================================================
LogisticRegression
LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=1000,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=2, warm_start=False)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 19.73%
Model accuracySS: 66.87% 
ROC just use this to check overfitting: 

0.6744898441190363
0.7053018776991334
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 0.60
Brier score loss: 0.33
BenefitItemdropMSS: 79
BenefitCodropMSS: 68.10%
CostItemdropMSS: 685
CostCodropMSS: 35.29%
ImproveRatio: 192.98%
balancetradeoffradio: 8.67
balancetradeoffradio: 8.67
ProfitratiodropM: 0.12

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 2477
MaxBudgetPerTargetAudienceBdropMSS: 80.74

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2724
MaxBudgetPerTargetAudiencePdropMSS: 73.42

             precision    recall  f1-score   support

          0       0.97      0.67      0.79      1825
          1       0.12      0.68      0.20       116

avg / total       0.92      0.67      0.76      1941

Confusion matrixdropM:
[[1219  606]
 [  37   79]]
[0 1 1 ... 1 0 1]
('TRAIN:', array([   0,    1,    4, ..., 5819, 5820, 5821]), 'TEST:', array([   2,    3,    7, ..., 5792, 5802, 5813]))
('Training Set Shape after oversampling:   ', (7298L, 22L), (7298L,))
col_0     0     1
row_0            
0      3649     0
1         0  3649
[LibLinear]==============================================================================================================
LogisticRegression
LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=1000,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=2, warm_start=False)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 19.41%
Model accuracySS: 64.91% 
ROC just use this to check overfitting: 

0.6761880018894663
0.6940777350431389
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 0.61
Brier score loss: 0.35
BenefitItemdropMSS: 82
BenefitCodropMSS: 70.69%
CostItemdropMSS: 729
CostCodropMSS: 37.56%
ImproveRatio: 188.21%
balancetradeoffradio: 8.89
balancetradeoffradio: 8.89
ProfitratiodropM: 0.11

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 2540
MaxBudgetPerTargetAudienceBdropMSS: 78.74

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2794
MaxBudgetPerTargetAudiencePdropMSS: 71.58

             precision    recall  f1-score   support

          0       0.97      0.65      0.78      1825
          1       0.11      0.71      0.19       116

avg / total       0.92      0.65      0.74      1941

Confusion matrixdropM:
[[1178  647]
 [  34   82]]
[0 1 1 ... 0 0 1]
('TRAIN:', array([   0,    2,    3, ..., 5816, 5818, 5821]), 'TEST:', array([   1,    4,    6, ..., 5817, 5819, 5820]))
('Training Set Shape after oversampling:   ', (7300L, 22L), (7300L,))
col_0     0     1
row_0            
0      3650     0
1         0  3650
[LibLinear]==============================================================================================================
LogisticRegression
LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=1000,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=2, warm_start=False)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 19.54%
Model accuracySS: 67.32% 
ROC just use this to check overfitting: 

0.6687934815486993
0.6990824279641003
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 0.59
Brier score loss: 0.33
BenefitItemdropMSS: 77
BenefitCodropMSS: 66.38%
CostItemdropMSS: 672
CostCodropMSS: 34.64%
ImproveRatio: 191.63%
balancetradeoffradio: 8.73
balancetradeoffradio: 8.73
ProfitratiodropM: 0.11

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 2494
MaxBudgetPerTargetAudienceBdropMSS: 80.19

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2743
MaxBudgetPerTargetAudiencePdropMSS: 72.91

             precision    recall  f1-score   support

          0       0.97      0.67      0.79      1824
          1       0.11      0.66      0.20       116

avg / total       0.92      0.67      0.76      1940

Confusion matrixdropM:
[[1229  595]
 [  39   77]]
[0 1 1 ... 0 0 0]
StratifiedKFold(n_splits=3, random_state=43, shuffle=True)
('TRAIN:', array([   1,    2,    3, ..., 5817, 5819, 5820]), 'TEST:', array([   0,    5,   10, ..., 5816, 5818, 5821]))
('Training Set Shape after oversampling:   ', (7298L, 22L), (7298L,))
col_0     0     1
row_0            
0      3649     0
1         0  3649
[LibSVM]==============================================================================================================
SVC
SVC(C=10, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=42, shrinking=True,
  tol=0.001, verbose=True)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 18.81%
Model accuracySS: 74.65% 
ROC just use this to check overfitting: 

0.6270595181861124
0.7842665916973001
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 0.54
Brier score loss: 0.25
BenefitItemdropMSS: 57
BenefitCodropMSS: 49.14%
CostItemdropMSS: 490
CostCodropMSS: 25.24%
ImproveRatio: 194.65%
balancetradeoffradio: 8.60
balancetradeoffradio: 8.60
ProfitratiodropM: 0.12

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 2457
MaxBudgetPerTargetAudienceBdropMSS: 81.40

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2702
MaxBudgetPerTargetAudiencePdropMSS: 74.02

             precision    recall  f1-score   support

          0       0.96      0.76      0.85      1825
          1       0.12      0.49      0.19       116

avg / total       0.91      0.75      0.81      1941

Confusion matrixdropM:
[[1392  433]
 [  59   57]]
[1 0 1 ... 1 0 1]
('TRAIN:', array([   0,    1,    4, ..., 5819, 5820, 5821]), 'TEST:', array([   2,    3,    7, ..., 5792, 5802, 5813]))
('Training Set Shape after oversampling:   ', (7298L, 22L), (7298L,))
col_0     0     1
row_0            
0      3649     0
1         0  3649
[LibSVM]==============================================================================================================
SVC
SVC(C=10, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=42, shrinking=True,
  tol=0.001, verbose=True)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 22.69%
Model accuracySS: 73.67% 
ROC just use this to check overfitting: 

0.6945087387812943
0.7860986949660276
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 0.51
Brier score loss: 0.26
BenefitItemdropMSS: 75
BenefitCodropMSS: 64.66%
CostItemdropMSS: 545
CostCodropMSS: 28.08%
ImproveRatio: 230.27%
balancetradeoffradio: 7.27
balancetradeoffradio: 7.27
ProfitratiodropM: 0.14

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 2077
MaxBudgetPerTargetAudienceBdropMSS: 96.29

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2284
MaxBudgetPerTargetAudiencePdropMSS: 87.57

             precision    recall  f1-score   support

          0       0.97      0.74      0.84      1825
          1       0.14      0.65      0.23       116

avg / total       0.92      0.74      0.80      1941

Confusion matrixdropM:
[[1355  470]
 [  41   75]]
[1 0 1 ... 1 0 1]
('TRAIN:', array([   0,    2,    3, ..., 5816, 5818, 5821]), 'TEST:', array([   1,    4,    6, ..., 5817, 5819, 5820]))
('Training Set Shape after oversampling:   ', (7300L, 22L), (7300L,))
col_0     0     1
row_0            
0      3650     0
1         0  3650
[LibSVM]==============================================================================================================
SVC
SVC(C=10, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=42, shrinking=True,
  tol=0.001, verbose=True)


****************ResultsdropM****************

----------------Unhelpful Scores

F-scoredropM: 17.54%
Model accuracySS: 70.93% 
ROC just use this to check overfitting: 

0.6193663036902601
0.7653188474256023
----------------Useful Scores: loss and cost-benefit scores

Log LossdropMSS: 0.53
Brier score loss: 0.29
BenefitItemdropMSS: 60
BenefitCodropMSS: 51.72%
CostItemdropMSS: 568
CostCodropMSS: 29.28%
ImproveRatio: 176.66%
balancetradeoffradio: 9.47
balancetradeoffradio: 9.47
ProfitratiodropM: 0.11

----------------Scenario analysis shreshold: marketing and controlling strategies

Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit, then the insurance company should be:


MinTargetAudienceBdropMSS: 2705
MaxBudgetPerTargetAudienceBdropMSS: 73.94

In order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:
MinTargetAudiencePdropMSS: 2976
MaxBudgetPerTargetAudiencePdropMSS: 67.20

             precision    recall  f1-score   support

          0       0.96      0.72      0.82      1824
          1       0.11      0.52      0.18       116

avg / total       0.91      0.71      0.78      1940

Confusion matrixdropM:
[[1316  508]
 [  56   60]]
[0 0 1 ... 0 1 0]

5. Model Selection

  • Model Selection for "4.2 with Smote X_TraindropM2", y_TraindropM2 liminate KNeighborsClassifier,GradientBoostingClassifier,QuadraticDiscriminantAnalysis,GaussianNB

5.1 Evaluate the models with the datasets: Smote X_TraindropM; y_TraindropM

  • Model Selection for "4.1 with Smote X_TraindropM, y_TraindropM" Eliminate GaussianNB,QuadraticDiscriminantAnalysis,KNeighborsClassifier, because the Loss of these three models are higher than 2.

5.1.1 Compare the unhelful scores

In [30]:
#def color_negative_red(lldropM2):
#    """
#    Takes a scalar and returns a string with
#    the css property `'color: red'` for negative
#    strings, black otherwise.
#    """
#    color = 'red' if lldropM2 > 0.7 else 'black'
#    return 'color: %s' % color
In [88]:
logUdropM.style.set_table_styles(
   [{'selector': 'tr:hover',
       'props': [('background-color', 'yellow')]}]
 )
Out[88]:
Classifier F-score Accuracy
0 KNeighborsClassifier 0.159664 0.885518
1 AdaBoostClassifier 0.258824 0.783629
2 GradientBoostingClassifier 0.188034 0.891242
3 LGBMClassifier 0.236607 0.804236
4 DecisionTreeClassifier 0.210526 0.819691
5 RandomForestClassifier 0.271955 0.852891
6 ExtraTreesClassifier 0.278867 0.810532
7 LinearDiscriminantAnalysis 0.237681 0.698912
8 BernoulliNB 0.256 0.733829
9 LogisticRegression 0.22975 0.70063
10 SVC 0.243636 0.761878

5.1.2 Compare the Helpful Scores

In [89]:
logdropM.style.set_table_styles(
   [{'selector': 'tr:hover',
       'props': [('background-color', 'yellow')]}]
 )
Out[89]:
Classifier Log Loss OverfittingRoc BLoss BI BO CI CO IR BTOR PR
0 KNeighborsClassifier 2.00838 -0.27 0.114482 19 0.159664 119 0.0681168 2.34397 6.26 0.16
1 AdaBoostClassifier 0.678296 -0.11 0.216371 66 0.554622 391 0.223812 2.47807 5.92 0.17
2 GradientBoostingClassifier 0.367894 -0.34 0.108758 22 0.184874 115 0.0658271 2.80848 5.23 0.19
3 LGBMClassifier 0.431159 -0.19 0.195764 53 0.445378 329 0.188323 2.36497 6.21 0.16
4 DecisionTreeClassifier 0.974821 -0.28 0.180309 42 0.352941 280 0.160275 2.2021 6.67 0.15
5 RandomForestClassifier 0.383399 -0.22 0.147109 48 0.403361 234 0.133944 3.01142 4.88 0.21
6 ExtraTreesClassifier 0.468264 -0.12 0.189468 64 0.537815 340 0.194619 2.76342 5.31 0.19
7 LinearDiscriminantAnalysis 0.573326 -0.01 0.301088 82 0.689076 571 0.326846 2.10826 6.96 0.14
8 BernoulliNB 0.705319 0.01 0.266171 80 0.672269 506 0.289639 2.32105 6.33 0.16
9 LogisticRegression 0.569373 -0.02 0.29937 78 0.655462 560 0.32055 2.04481 7.18 0.14
10 SVC 0.489028 -0.1 0.238122 67 0.563025 431 0.246709 2.28215 6.43 0.16

5.1.3 Compare the scenario KPIs

In [105]:
print ("Business Application of the Model: \n\nLet´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. \nIn summary, the given KPIs of the insurance company are: " +CBEIGE+ "\n\nBudget = 200000 \nRevenuePerBenefitItem = 700 \nProfitGoal = 20000\n"+CEND+'\nThen to keep the account balance not to be deficit (MTAB and MBPTAB) and to reach the proft goal (MTAP and MBPTAP), then the insurance company should be:\n\n')

logCBAdropM.style.set_table_styles(
   [{'selector': 'tr:hover',
       'props': [('background-color', 'yellow')]}]
 )
Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit (MTAB and MBPTAB) and to reach the proft goal (MTAP and MBPTAP), then the insurance company should be:


Out[105]:
Classifier MTAB MBPTAB MTAP MBPTAP
0 KNeighborsClassifier 1788 111.86 1967 101.68
1 AdaBoostClassifier 1691 118.27 1860 107.53
2 GradientBoostingClassifier 1494 133.87 1643 121.73
3 LGBMClassifier 1774 112.74 1951 102.51
4 DecisionTreeClassifier 1905 104.99 2096 95.42
5 RandomForestClassifier 1394 143.47 1533 130.46
6 ExtraTreesClassifier 1517 131.84 1668 119.9
7 LinearDiscriminantAnalysis 1988 100.6 2187 91.45
8 BernoulliNB 1808 110.62 1989 100.55
9 LogisticRegression 2051 97.51 2256 88.65
10 SVC 1837 108.87 2020 99.01

5.2 Evaluate the models with the datasets: Smote X_TrainselecdropM; y_TrainselecdropM

  • Model Selection for "4.2 with Smote X_TrainselecdropM", y_TrainselecdropM
  • RandomForestClassifeir, ExtraTreesClassifier, LogisticRegression and SVC performs good
  • The overfitting problem is enlighted a lot in this model clusters

5.2.1 Compare the unhelful scores

In [91]:
logselecUdropM.style.set_table_styles(
   [{'selector': 'tr:hover',
       'props': [('background-color', 'yellow')]}]
 )
Out[91]:
Classifier F-score Accuracy
0 KNeighborsClassifier 0.03125 0.929021
1 AdaBoostClassifier 0 0.931883
2 GradientBoostingClassifier 0.144444 0.911849
3 LGBMClassifier 0 0.931883
4 DecisionTreeClassifier 0.0458015 0.928449
5 RandomForestClassifier 0.284289 0.835718
6 ExtraTreesClassifier 0.287918 0.841442
7 LinearDiscriminantAnalysis 0.046875 0.930166
8 BernoulliNB 0 0.931883
9 LogisticRegression 0.240929 0.70063
10 SVC 0.269006 0.785346

5.2.2 Compare the helpful score

In [92]:
logselecdropM.style.set_table_styles(
   [{'selector': 'tr:hover',
       'props': [('background-color', 'yellow')]}]
 )
Out[92]:
Classifier Log Loss OverfittingRoc BLoss BI BO CI CO IR BTOR PR
0 KNeighborsClassifier 1.64725 -0.03 0.0709788 2 0.0168067 9 0.00515169 3.26237 4.5 0.22
1 AdaBoostClassifier 0.66076 -0.01 0.0681168 0 0 0 0 0 0 0
2 GradientBoostingClassifier 1.06875 -0.07 0.0881511 13 0.109244 61 0.034917 3.12867 4.69 0.21
3 LGBMClassifier 0.222669 0 0.0681168 0 0 0 0 0 0 0
4 DecisionTreeClassifier 0.784928 -0.05 0.0715512 3 0.0252101 12 0.00686892 3.67017 4 0.25
5 RandomForestClassifier 0.447733 -0.11 0.164282 57 0.478992 282 0.16142 2.96737 4.95 0.2
6 ExtraTreesClassifier 0.423061 -0.11 0.158558 56 0.470588 270 0.154551 3.04488 4.82 0.21
7 LinearDiscriminantAnalysis 0.230387 -0.01 0.069834 3 0.0252101 9 0.00515169 4.89356 3 0.33
8 BernoulliNB 0.22986 -0 0.0681168 0 0 0 0 0 0 0
9 LogisticRegression 0.568247 -0 0.29937 83 0.697479 570 0.326274 2.13771 6.87 0.15
10 SVC 0.230827 -0.07 0.214654 69 0.579832 394 0.225529 2.57098 5.71 0.18

5.2.3 Compare the scenario KPIs

In [106]:
print ("Business Application of the Model: \n\nLet´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. \nIn summary, the given KPIs of the insurance company are: " +CBEIGE+ "\n\nBudget = 200000 \nRevenuePerBenefitItem = 700 \nProfitGoal = 20000\n"+CEND+'\nThen to keep the account balance not to be deficit (MTAB and MBPTAB) and to reach the proft goal (MTAP and MBPTAP), then the insurance company should be:\n\n')

logCBAselecdropM.style.set_table_styles(
   [{'selector': 'tr:hover',
       'props': [('background-color', 'yellow')]}]
 )
Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit (MTAB and MBPTAB) and to reach the proft goal (MTAP and MBPTAP), then the insurance company should be:


Out[106]:
Classifier MTAB MBPTAB MTAP MBPTAP
0 KNeighborsClassifier 1285 155.64 1414 141.44
1 AdaBoostClassifier 0 0 0 0
2 GradientBoostingClassifier 1340 149.25 1474 135.69
3 LGBMClassifier 0 0 0 0
4 DecisionTreeClassifier 1142 175.13 1257 159.11
5 RandomForestClassifier 1414 141.44 1555 128.62
6 ExtraTreesClassifier 1377 145.24 1514 132.1
7 LinearDiscriminantAnalysis 857 233.37 942 212.31
8 BernoulliNB 0 0 0 0
9 LogisticRegression 1962 101.94 2159 92.64
10 SVC 1631 122.62 1794 111.48

5.3 Evaluate the models with the datasets using Train_test_split: Smote X_TraindropM2; y_TraincdropM2

5.3.1 Compare the unhelpful scores

In [111]:
logUdropM2.style.set_table_styles(
   [{'selector': 'tr:hover',
       'props': [('background-color', 'yellow')]}]
 )
Out[111]:
Classifier F-score Accuracy
0 KNeighborsClassifier 0.123711 0.90269
1 AdaBoostClassifier 0.265271 0.759015
2 GradientBoostingClassifier 0.218045 0.880939
3 LGBMClassifier 0.247582 0.777333
4 DecisionTreeClassifier 0.215311 0.81225
5 RandomForestClassifier 0.27762 0.854035
6 ExtraTreesClassifier 0.290323 0.823698
7 LinearDiscriminantAnalysis 0.24152 0.680023
8 BernoulliNB 0.251969 0.728105
9 LogisticRegression 0.240109 0.681168
10 SVC 0.228464 0.764167

5.3.2 Compare the helpful score

In [112]:
logdropM2.style.set_table_styles(
   [{'selector': 'tr:hover',
       'props': [('background-color', 'yellow')]}]
 )
Out[112]:
Classifier Log Loss OverfittingRoc BLoss2 BI BO CI CO IR BTOR PR
0 KNeighborsClassifier 2.15947 -0.25 0.0973097 12 0.10084 75 0.0429307 2.34891 6.25 0.89
1 AdaBoostClassifier 0.678083 -0.07 0.240985 76 0.638655 454 0.259874 2.45756 5.97 0.15
2 GradientBoostingClassifier 0.360292 -0.31 0.119061 29 0.243697 147 0.0841442 2.89619 5.07 0.46
3 LGBMClassifier 0.456929 -0.14 0.222667 64 0.537815 398 0.227819 2.36071 6.22 0.17
4 DecisionTreeClassifier 1.06183 -0.26 0.18775 45 0.378151 299 0.171151 2.20947 6.64 0.22
5 RandomForestClassifier 0.385461 -0.23 0.145965 49 0.411765 234 0.133944 3.07416 4.78 0.29
6 ExtraTreesClassifier 0.468454 -0.13 0.176302 63 0.529412 315 0.180309 2.93613 5 0.21
7 LinearDiscriminantAnalysis 0.577582 0.03 0.319977 89 0.747899 618 0.353749 2.11421 6.94 0.11
8 BernoulliNB 0.598566 0.01 0.271895 80 0.672269 516 0.295363 2.27607 6.45 0.13
9 LogisticRegression 0.573504 0.03 0.318832 88 0.739496 614 0.35146 2.10407 6.98 0.11
10 SVC 0.495548 -0.14 0.235833 61 0.512605 415 0.23755 2.15788 6.8 0.16

5.3.3 Compare the scenario KPIs

In [113]:
print ("Business Application of the Model: \n\nLet´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. \nIn summary, the given KPIs of the insurance company are: " +CBEIGE+ "\n\nBudget = 200000 \nRevenuePerBenefitItem = 700 \nProfitGoal = 20000\n"+CEND+'\nThen to keep the account balance not to be deficit (MTAB and MBPTAB) and to reach the proft goal (MTAP and MBPTAP), then the insurance company should be:\n\n')

logCBAdropM.style.set_table_styles(
   [{'selector': 'tr:hover',
       'props': [('background-color', 'yellow')]}]
 )
Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit (MTAB and MBPTAB) and to reach the proft goal (MTAP and MBPTAP), then the insurance company should be:


Out[113]:
Classifier MTAB MBPTAB MTAP MBPTAP
0 KNeighborsClassifier 1788 111.86 1967 101.68
1 AdaBoostClassifier 1691 118.27 1860 107.53
2 GradientBoostingClassifier 1494 133.87 1643 121.73
3 LGBMClassifier 1774 112.74 1951 102.51
4 DecisionTreeClassifier 1905 104.99 2096 95.42
5 RandomForestClassifier 1394 143.47 1533 130.46
6 ExtraTreesClassifier 1517 131.84 1668 119.9
7 LinearDiscriminantAnalysis 1988 100.6 2187 91.45
8 BernoulliNB 1808 110.62 1989 100.55
9 LogisticRegression 2051 97.51 2256 88.65
10 SVC 1837 108.87 2020 99.01

5.4 Evaluate the models with the datasets using KFold: Smote X_TraindropM2; y_TraincdropM2

5.4.1 Compare the unhelpful scores

In [97]:
print ('Model Evaluation ')

logUdropMSS.style.set_table_styles(
   [{'selector': 'tr:hover',
       'props': [('background-color', 'yellow')]}]
 ) 
Model Evaluation 
Out[97]:
Classifier F-score Accuracy
0 KNeighborsClassifier 0.163636 0.905204
1 KNeighborsClassifier 0.14876 0.893869
2 KNeighborsClassifier 0.173228 0.891753
3 AdaBoostClassifier 0.210526 0.768161
4 AdaBoostClassifier 0.213992 0.704791
5 AdaBoostClassifier 0.188525 0.693814
6 GradientBoostingClassifier 0.191489 0.84338
7 GradientBoostingClassifier 0.215909 0.857805
8 GradientBoostingClassifier 0.198433 0.841753
9 LGBMClassifier 0.213058 0.764039
10 LGBMClassifier 0.220588 0.726945
11 LGBMClassifier 0.229277 0.774742
12 DecisionTreeClassifier 0.20915 0.812983
13 DecisionTreeClassifier 0.236181 0.84338
14 DecisionTreeClassifier 0.214689 0.856701
15 RandomForestClassifier 0.204276 0.827409
16 RandomForestClassifier 0.271552 0.825863
17 RandomForestClassifier 0.244344 0.827835
18 ExtraTreesClassifier 0.2158 0.790314
19 ExtraTreesClassifier 0.25641 0.790829
20 ExtraTreesClassifier 0.238971 0.786598
21 LinearDiscriminantAnalysis 0.194581 0.66306
22 LinearDiscriminantAnalysis 0.183007 0.613601
23 LinearDiscriminantAnalysis 0.197253 0.668557
24 BernoulliNB 0.213256 0.718702
25 BernoulliNB 0.211248 0.703761
26 BernoulliNB 0.219136 0.739175
27 LogisticRegression 0.200772 0.680062
28 LogisticRegression 0.195065 0.647089
29 LogisticRegression 0.19697 0.672165
30 SVC 0.181818 0.744977
31 SVC 0.211096 0.699639
32 SVC 0.189274 0.735052

5.4.2 Compare the helpful scores

KFold is less perssimistic than holdout(Train_test_split), so the brier_score_loss is bigger than the Train_test_split.

In [98]:
logdropMSS.style.set_table_styles(
   [{'selector': 'tr:hover',
       'props': [('background-color', 'yellow')]}]
 )
Out[98]:
Classifier Log Loss OverfittingRoc BLossS BI BO CI CO IR BTOR PR
0 KNeighborsClassifier 2.14431 0.11 0.0947965 18 0.155172 104 0.0535806 2.89605 5.78 0.17
1 KNeighborsClassifier 2.05548 0.11 0.106131 18 0.155172 126 0.064915 2.39039 7 0.14
2 KNeighborsClassifier 1.89009 0.11 0.108247 22 0.189655 138 0.071134 2.66617 6.27 0.16
3 AdaBoostClassifier 0.680978 0.05 0.231839 60 0.517241 454 0.2339 2.21138 7.57 0.13
4 AdaBoostClassifier 0.681346 -0.01 0.295209 78 0.672414 613 0.315817 2.12913 7.86 0.13
5 AdaBoostClassifier 0.675508 0.05 0.306186 69 0.594828 616 0.317526 1.87332 8.93 0.11
6 GradientBoostingClassifier 0.36589 0.16 0.15662 36 0.310345 260 0.133952 2.31684 7.22 0.14
7 GradientBoostingClassifier 0.371264 0.14 0.142195 38 0.327586 236 0.121587 2.69426 6.21 0.16
8 GradientBoostingClassifier 0.422503 0.12 0.158247 38 0.327586 267 0.137629 2.38021 7.03 0.14
9 LGBMClassifier 0.481679 0.06 0.235961 62 0.534483 466 0.240082 2.22625 7.52 0.13
10 LGBMClassifier 0.491633 0.02 0.273055 75 0.646552 564 0.290572 2.2251 7.52 0.13
11 LGBMClassifier 0.468511 0.05 0.225258 65 0.560345 451 0.232474 2.41035 6.94 0.14
12 DecisionTreeClassifier 1.07855 0.15 0.187017 48 0.413793 343 0.176713 2.34161 7.15 0.14
13 DecisionTreeClassifier 1.10446 0.11 0.15662 47 0.405172 282 0.145286 2.78879 6 0.17
14 DecisionTreeClassifier 0.902707 0.14 0.143299 38 0.327586 238 0.12268 2.67024 6.26 0.16
15 RandomForestClassifier 0.407392 0.17 0.172591 43 0.37069 305 0.157135 2.35904 7.09 0.14
16 RandomForestClassifier 0.408663 0.1 0.174137 63 0.543103 348 0.179289 3.02921 5.52 0.18
17 RandomForestClassifier 0.39892 0.11 0.172165 54 0.465517 326 0.168041 2.77026 6.04 0.17
18 ExtraTreesClassifier 0.479302 0.13 0.209686 56 0.482759 403 0.207625 2.32515 7.2 0.14
19 ExtraTreesClassifier 0.487331 0.08 0.209171 70 0.603448 430 0.221535 2.72394 6.14 0.16
20 ExtraTreesClassifier 0.488101 0.09 0.213402 65 0.560345 428 0.220619 2.53988 6.58 0.15
21 LinearDiscriminantAnalysis 0.610099 0.03 0.33694 79 0.681034 696 0.358578 1.89926 8.81 0.11
22 LinearDiscriminantAnalysis 0.616054 0.02 0.386399 84 0.724138 802 0.413189 1.75256 9.55 0.1
23 LinearDiscriminantAnalysis 0.598339 0.03 0.331443 79 0.681034 685 0.353093 1.92877 8.67 0.12
24 BernoulliNB 0.638718 0.02 0.281298 74 0.637931 578 0.297785 2.14226 7.81 0.13
25 BernoulliNB 0.635346 0.01 0.296239 77 0.663793 613 0.315817 2.10183 7.96 0.13
26 BernoulliNB 0.617688 0.02 0.260825 71 0.612069 532 0.274227 2.23198 7.49 0.13
27 LogisticRegression 0.598778 0.03 0.319938 78 0.672414 661 0.340546 1.97452 8.47 0.12
28 LogisticRegression 0.611036 0.01 0.352911 83 0.715517 735 0.378671 1.88955 8.86 0.11
29 LogisticRegression 0.595817 0.03 0.327835 78 0.672414 676 0.348454 1.92971 8.67 0.12
30 SVC 0.531995 0.16 0.255023 55 0.474138 489 0.251932 1.88201 8.89 0.11
31 SVC 0.498762 0.09 0.300361 78 0.672414 623 0.320969 2.09495 7.99 0.13
32 SVC 0.523485 0.13 0.264948 60 0.517241 518 0.26701 1.93716 8.63 0.12

5.4.3 Compare the scenario KPIs

In [109]:
print ("Business Application of the Model: \n\nLet´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. \nIn summary, the given KPIs of the insurance company are: " +CBEIGE+ "\n\nBudget = 200000 \nRevenuePerBenefitItem = 700 \nProfitGoal = 20000\n"+CEND+'\nThen to keep the account balance not to be deficit (MTAB and MBPTAB) and to reach the proft goal (MTAP and MBPTAP), then the insurance company should be:\n\n')

logCBAdropMSS.style.set_table_styles(
   [{'selector': 'tr:hover',
       'props': [('background-color', 'yellow')]}]
 )
Business Application of the Model: 

Let´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. 
In summary, the given KPIs of the insurance company are: 

Budget = 200000 
RevenuePerBenefitItem = 700 
ProfitGoal = 20000

Then to keep the account balance not to be deficit (MTAB and MBPTAB) and to reach the proft goal (MTAP and MBPTAP), then the insurance company should be:


Out[109]:
Classifier MTAB MBPTAB MTAP MBPTAP
0 KNeighborsClassifier 1428 140.06 1571 127.31
1 KNeighborsClassifier 2354 84.96 2589 77.25
2 KNeighborsClassifier 2342 85.4 2577 77.61
3 AdaBoostClassifier 2134 93.72 2347 85.22
4 AdaBoostClassifier 2271 88.07 2498 80.06
5 AdaBoostClassifier 2422 82.58 2665 75.05
6 GradientBoostingClassifier 1985 100.76 2184 91.58
7 GradientBoostingClassifier 1688 118.48 1857 107.7
8 GradientBoostingClassifier 2088 95.79 2297 87.07
9 LGBMClassifier 2177 91.87 2394 83.54
10 LGBMClassifier 2222 90.01 2445 81.8
11 LGBMClassifier 2228 89.77 2451 81.6
12 DecisionTreeClassifier 2122 94.25 2335 85.65
13 DecisionTreeClassifier 2471 80.94 2718 73.58
14 DecisionTreeClassifier 1945 102.83 2140 93.46
15 RandomForestClassifier 1868 107.07 2055 97.32
16 RandomForestClassifier 1597 125.23 1756 113.9
17 RandomForestClassifier 1914 104.49 2105 95.01
18 ExtraTreesClassifier 2077 96.29 2284 87.57
19 ExtraTreesClassifier 1717 116.48 1888 105.93
20 ExtraTreesClassifier 1825 109.59 2008 99.6
21 LinearDiscriminantAnalysis 2448 81.7 2693 74.27
22 LinearDiscriminantAnalysis 2671 74.88 2938 68.07
23 LinearDiscriminantAnalysis 2497 80.1 2746 72.83
24 BernoulliNB 2231 89.65 2454 81.5
25 BernoulliNB 2282 87.64 2511 79.65
26 BernoulliNB 2171 92.12 2388 83.75
27 LogisticRegression 2477 80.74 2724 73.42
28 LogisticRegression 2540 78.74 2794 71.58
29 LogisticRegression 2494 80.19 2743 72.91
30 SVC 2457 81.4 2702 74.02
31 SVC 2077 96.29 2284 87.57
32 SVC 2705 73.94 2976 67.2

5.5 Visualise the Amount of the Benefit Items and the Cost Items

5.5.1 Visualise the Amount of the Benefit Items and the Cost Items: Smote X_TraindropM; y_TraindropM

In [61]:
#log_colsselecdropM=["Classifier", "Log Loss","BI","BO", "CI", "CO", "IR","BTOR"] 

#log_colsCBAselecdropM =["Classifier", "MTAB","MBPTAB","MTAP","MBPTAP"]
sns.set(rc={'figure.figsize':(7.27,5.27)})

sns.set_color_codes("muted")
sns.barplot(x='BI', y='Classifier', data=logdropM, color="g")

plt.xlabel('BI')
plt.title('Amount of Benefit Items')
plt.show()

sns.set_color_codes("muted")
sns.barplot(x='CI', y='Classifier', data=logdropM, color="r")

plt.xlabel('CI')
plt.title('Amount of cost items')
plt.show()

sns.set_color_codes("muted")
sns.barplot(x='BO', y='Classifier', data=logdropM, color="g")

plt.xlabel('BO %')
plt.title('Benefit Ratio')
plt.show()

sns.set_color_codes("muted")
sns.barplot(x='IR', y='Classifier', data=logdropM, color="b")

plt.xlabel('IR %')
plt.title('Improvement Ratio')
plt.show()

sns.set_color_codes("muted")
sns.barplot(x='PR', y='Classifier', data=logdropM, color="g")

plt.xlabel('PR %')
plt.title('Classifier Profit Ratio')
plt.show()

sns.set_color_codes("muted")
sns.barplot(x='Log Loss', y='Classifier', data=logdropM, color="b")

plt.xlabel('Log Loss')
plt.title('Log Loss')
plt.show()

5.5.2 Visualise the Amount of the Benefit Items and the Cost Items: Smote X_TrainselecdropM; y_TrainselecdropM

In [62]:
sns.set_color_codes("muted")
sns.barplot(x='BI', y='Classifier', data=logselecdropM, color="g")

plt.xlabel('BI')
plt.title('Classifier Amount of Benefit Items')
plt.show()

sns.set_color_codes("muted")
sns.barplot(x='CI', y='Classifier', data=logselecdropM, color="r")

plt.xlabel('CI')
plt.title('Classifier Amount of cost items')
plt.show()

sns.set_color_codes("muted")
sns.barplot(x='BO', y='Classifier', data=logselecdropM, color="g")

plt.xlabel('BO %')
plt.title('Classifier Benefit Ratio')
plt.show()

sns.set_color_codes("muted")
sns.barplot(x='IR', y='Classifier', data=logselecdropM, color="b")

plt.xlabel('IR %')
plt.title('Improvement Ratio')
plt.show()

sns.set_color_codes("muted")
sns.barplot(x='PR', y='Classifier', data=logselecdropM, color="g")

plt.xlabel('PR %')
plt.title('Classifier Profit Ratio')
plt.show()

sns.set_color_codes("muted")
sns.barplot(x='Log Loss', y='Classifier', data=logselecdropM, color="b")

plt.xlabel('Log Loss')
plt.title('Classifier Log Loss')
plt.show()

5.5.3 Visualise the Amount of the Benefit Items and the Cost Items: Smote X_TraindropM2; y_TrainselecdropM2

In [63]:
sns.set_color_codes("muted")
sns.barplot(x='BI', y='Classifier', data=logdropM2, color="g")

plt.xlabel('BI')
plt.title('Classifier Amount of Benefit Items')
plt.show()

sns.set_color_codes("muted")
sns.barplot(x='CI', y='Classifier', data=logdropM2, color="r")

plt.xlabel('CI')
plt.title('Classifier Amount of cost items')
plt.show()

sns.set_color_codes("muted")
sns.barplot(x='BO', y='Classifier', data=logdropM2, color="g")

plt.xlabel('BO %')
plt.title('Classifier Benefit Ratio')
plt.show()

sns.set_color_codes("muted")
sns.barplot(x='IR', y='Classifier', data=logdropM2, color="b")

plt.xlabel('IR %')
plt.title('Improvement Ratio')
plt.show()

sns.set_color_codes("muted")
sns.barplot(x='PR', y='Classifier', data=logdropM2, color="g")

plt.xlabel('PR %')
plt.title('Classifier Profit Ratio')
plt.show()

sns.set_color_codes("muted")
sns.barplot(x='Log Loss', y='Classifier', data=logdropM2, color="b")

plt.xlabel('Log Loss')
plt.title('Classifier Log Loss')
plt.show()

5.5.4 Visualise the Amount of the Benefit Items and the Cost Items: KFold with Smote X_TraindropM; y_TraindropM

In [80]:
sns.set_color_codes("muted")
sns.barplot(x='BI', y='Classifier', data=logdropMSS, color="g")

plt.xlabel('BI')
plt.title('Classifier Amount of Benefit Items')
plt.show()

sns.set_color_codes("muted")
sns.barplot(x='CI', y='Classifier', data=logdropMSS, color="r")

plt.xlabel('CI')
plt.title('Classifier Amount of cost items')
plt.show()

sns.set_color_codes("muted")
sns.barplot(x='BO', y='Classifier', data=logdropMSS, color="g")

plt.xlabel('BO %')
plt.title('Classifier Benefit Ratio')
plt.show()

sns.set_color_codes("muted")
sns.barplot(x='IR', y='Classifier', data=logdropMSS, color="b")

plt.xlabel('IR %')
plt.title('Improvement Ratio')
plt.show()

sns.set_color_codes("muted")
sns.barplot(x='PR', y='Classifier', data=logdropMSS, color="g")

plt.xlabel('PR %')
plt.title('Classifier Profit Ratio')
plt.show()

sns.set_color_codes("muted")
sns.barplot(x='Log Loss', y='Classifier', data=logdropMSS, color="b")

plt.xlabel('Log Loss')
plt.title('Classifier Log Loss')
plt.show()

6 Persona of the target group: Plotting the Dependency of prefering Caravan Policy

  • Contribution to Car policy with 6
  • high purchase power
  • high Contribution private third party insurance
  • has contribution to boat policies

6.1 Plotting the Dependency of prefering Caravan Policy based on Contribution of car policies

In [65]:
#Train['C68APERSAUT'].value_counts().plot(kind='bar', color='steelblue', grid=True)
#plt.xlabel('Number of car policies')
#plt.ylabel('count')

num_car_caravan = pd.crosstab(Train['C47PPERSAUT'], Train['C86CARAVAN'])
num_car_caravan_pct = num_car_caravan.div(num_car_caravan.sum(1).astype(float), axis=0)
num_car_caravan_pct.plot(kind='bar', stacked=True, color = ['steelblue', 'lightpink'], grid=True)
plt.xlabel('Contribution of car policies')
plt.ylabel('caravan')


num_car_caravan = pd.crosstab(Train['C47PPERSAUT'], Train['C86CARAVAN'])
#num_car_caravan_pct = num_car_caravan.div(num_car_caravan.sum(1).astype(float), axis=0)
num_car_caravan.plot(kind='bar', stacked=True, color = ['steelblue', 'lightpink'], grid=True)
plt.xlabel('Contribution of car policies')
plt.ylabel('caravan')

#num_car_caravan = pd.crosstab(Train['C68APERSAUT'], Train['C86CARAVAN'])
##num_car_caravan_pct = num_car_caravan.div(num_car_caravan.sum(1).astype(float), axis=0)
#num_car_caravan.plot.pie(subplots=True)
#plt.xlabel('Number of car policies')
#plt.ylabel('caravan')
print Train['C47PPERSAUT'].value_counts()
print pd.crosstab(Train['C47PPERSAUT'], Train['C86CARAVAN'])
0    2845
6    2319
5     613
7      41
8       3
4       1
Name: C47PPERSAUT, dtype: int64
C86CARAVAN      0    1
C47PPERSAUT           
0            2773   72
4               1    0
5             599   14
6            2057  262
7              41    0
8               3    0

6.2 Plotting the Dependency of prefering Caravan Policy based on Purchasing power class

In [66]:
num_car_caravan = pd.crosstab(Train['C43MKOOPKLA'], Train['C86CARAVAN'])
num_car_caravan_pct = num_car_caravan.div(num_car_caravan.sum(1).astype(float), axis=0)
num_car_caravan_pct.plot(kind='bar', stacked=True, color = ['steelblue', 'lightpink'], grid=True)
plt.xlabel('Purchasing power class')
plt.ylabel('caravan')


num_car_caravan = pd.crosstab(Train['C43MKOOPKLA'], Train['C86CARAVAN'])
#num_car_caravan_pct = num_car_caravan.div(num_car_caravan.sum(1).astype(float), axis=0)
num_car_caravan.plot(kind='bar', stacked=True, color = ['steelblue', 'lightpink'], grid=True)
plt.xlabel('Purchasing power class')
plt.ylabel('caravan')

#num_car_caravan = pd.crosstab(Train['C68APERSAUT'], Train['C86CARAVAN'])
##num_car_caravan_pct = num_car_caravan.div(num_car_caravan.sum(1).astype(float), axis=0)
#num_car_caravan.plot.pie(subplots=True)
#plt.xlabel('Number of car policies')
#plt.ylabel('caravan')
print Train['C43MKOOPKLA'].value_counts()
print pd.crosstab(Train['C43MKOOPKLA'], Train['C86CARAVAN'])
3    1524
4     902
6     901
1     587
5     583
7     474
8     426
2     425
Name: C43MKOOPKLA, dtype: int64
C86CARAVAN      0   1
C43MKOOPKLA          
1             569  18
2             410  15
3            1453  71
4             856  46
5             553  30
6             835  66
7             407  67
8             391  35

6.3 Plotting the Dependency of prefering Caravan Policy based on Contribution private third party insurance

In [67]:
num_car_caravan = pd.crosstab(Train['C44PWAPART'], Train['C86CARAVAN'])
num_car_caravan_pct = num_car_caravan.div(num_car_caravan.sum(1).astype(float), axis=0)
num_car_caravan_pct.plot(kind='bar', stacked=True, color = ['steelblue', 'lightpink'], grid=True)
plt.xlabel('Contribution private third party insurance')
plt.ylabel('caravan')


num_car_caravan = pd.crosstab(Train['C44PWAPART'], Train['C86CARAVAN'])
#num_car_caravan_pct = num_car_caravan.div(num_car_caravan.sum(1).astype(float), axis=0)
num_car_caravan.plot(kind='bar', stacked=True, color = ['steelblue', 'lightpink'], grid=True)
plt.xlabel('Contribution private third party insurance')
plt.ylabel('caravan')

#num_car_caravan = pd.crosstab(Train['C68APERSAUT'], Train['C86CARAVAN'])
##num_car_caravan_pct = num_car_caravan.div(num_car_caravan.sum(1).astype(float), axis=0)
#num_car_caravan.plot.pie(subplots=True)
#plt.xlabel('Number of car policies')
#plt.ylabel('caravan')
print Train['C44PWAPART'].value_counts()
print pd.crosstab(Train['C44PWAPART'], Train['C86CARAVAN'])
0    3482
2    2128
1     201
3      11
Name: C44PWAPART, dtype: int64
C86CARAVAN     0    1
C44PWAPART           
0           3335  147
1            193    8
2           1937  191
3              9    2

6.4 Plotting the Dependency of prefering Caravan Policy based on Contribution boat policies

In [68]:
num_car_caravan = pd.crosstab(Train['C61PPLEZIER'], Train['C86CARAVAN'])
num_car_caravan_pct = num_car_caravan.div(num_car_caravan.sum(1).astype(float), axis=0)
num_car_caravan_pct.plot(kind='bar', stacked=True, color = ['steelblue', 'lightpink'], grid=True)
plt.xlabel(' Contribution boat policies')
plt.ylabel('caravan')


num_car_caravan = pd.crosstab(Train['C61PPLEZIER'], Train['C86CARAVAN'])
#num_car_caravan_pct = num_car_caravan.div(num_car_caravan.sum(1).astype(float), axis=0)
num_car_caravan.plot(kind='bar', stacked=True, color = ['steelblue', 'lightpink'], grid=True)
plt.xlabel('Contribution boat policies')
plt.ylabel('caravan')

#num_car_caravan = pd.crosstab(Train['C68APERSAUT'], Train['C86CARAVAN'])
##num_car_caravan_pct = num_car_caravan.div(num_car_caravan.sum(1).astype(float), axis=0)
#num_car_caravan.plot.pie(subplots=True)
#plt.xlabel('Number of car policies')
#plt.ylabel('caravan')
print Train['C61PPLEZIER'].value_counts()
print pd.crosstab(Train['C61PPLEZIER'], Train['C86CARAVAN'])
0    5789
4      13
3       5
2       5
1       5
6       3
5       2
Name: C61PPLEZIER, dtype: int64
C86CARAVAN      0    1
C61PPLEZIER           
0            5454  335
1               2    3
2               3    2
3               3    2
4               9    4
5               2    0
6               1    2